In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
from sklearn import preprocessing
import subprocess
import matplotlib.pyplot as plt
from itertools import islice
import re

In [2]:
%matplotlib inline

In [3]:
pdf_file_path = 'parsers/pdfs/West Bengal/2017-18/2017_bp11_Demand Nos.1-5.pdf'

In [4]:
pdf = PdfFileReader(open(pdf_file_path, 'rb'))

In [5]:
def get_page_image_from_pdf(pdf, page_num, image_file_name):
    page_layout = pdf.getPage(page_num)['/MediaBox']
    command = "convert -density 300 '%s'[%s] '%s'" % (pdf_file_path,
                                                      page_num,
                                                      image_file_name)
    subprocess.check_output(command, shell=True)
    return cv2.imread(image_file_name, 0)

In [6]:
img_page2 = get_page_image_from_pdf(pdf, 2, 'west_bengal_demand_1_5_page_1.png')
img_page4 = get_page_image_from_pdf(pdf, 4, 'west_bengal_demand_1_5_page_1.png')

In [7]:
plt.figure(figsize=(30,20))
plt.imshow(img_page2, cmap='gray')


Out[7]:
<matplotlib.image.AxesImage at 0x7f514405d410>

In [8]:
plt.figure(figsize=(30,20))
plt.imshow(img_page4, cmap='gray')


Out[8]:
<matplotlib.image.AxesImage at 0x7f5143f59990>

RLSA Implementation

http://crblpocr.blogspot.in/2007/06/run-length-smoothing-algorithm-rlsa.html

The Run Length Smoothing Algorithm (RLSA) is a method that can be used for Block segmentation and text discrimination. The method developed for the Document Analysis System consists of two steps. First, a segmentation procedure subdivides the area of a document into regions (blocks), each of which should contain only one type of data (text, graphic, halftone image, etc.). Next, some basic features of these blocks are calculated.

The basic RLSA is applied to a binary sequence in which white pixels are represented by 0’s and black pixels by 1’s. The algorithm transforms a binary sequence x into an output sequence y according to the following rules:

  1. 0’s in x are changed to 1’s in y if the number of adjacent 0’s is less than or equal to a predefined limit C.
  2. 1’s in x are unchanged in y .

For example, with C = 4 the sequence x is mapped into y as follows:

x : 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0
y : 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1


In [9]:
ret,thresh1 = cv2.threshold(img_page2,0,1,cv2.THRESH_BINARY_INV)
plt.figure(figsize=(30,20))
plt.imshow(thresh1, cmap='gray')


Out[9]:
<matplotlib.image.AxesImage at 0x7f5143c8c1d0>

In [10]:
def rlsa(img, threshold):
    ret,thresh1 = cv2.threshold(img, 0, 1, cv2.THRESH_BINARY_INV)
    img_iter = np.nditer(thresh1, flags=['multi_index'])
    C_vertical, C_horizontal = threshold
    temp_thresh = thresh1.copy()
    while not img_iter.finished:
        x, y = img_iter.multi_index
        x_threshold = x + C_horizontal
        y_threshold = y + C_vertical
        neg_x_threshold = x - C_horizontal
        neg_y_threshold = y - C_vertical
        if (thresh1[x:x_threshold, y:y_threshold].any() 
            or thresh1[x:x_threshold, y:neg_y_threshold].any()
            or thresh1[x:neg_x_threshold, y:y_threshold].any()
            or thresh1[x:neg_x_threshold, y:neg_y_threshold].any()):
            temp_thresh[x, y] = 1
        else:
            temp_thresh[x, y] = 0
        img_iter.iternext()
    return temp_thresh

def plot_page(img):
    plt.figure(figsize=(30,20))
    plt.imshow(img, cmap='gray')

In [11]:
img_page4_rlsa = rlsa(img_page4, (15, 25))

In [12]:
plot_page(img_page4_rlsa)



In [13]:
n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_page4_rlsa)

In [14]:
n_comp


Out[14]:
167

In [15]:
plot_page(labels)



In [16]:
stats_columns = ["left", "top", "width", "height", "area"]
label_stats = pd.DataFrame(stats, columns=stats_columns)
label_stats['centroid_x'], label_stats['centroid_y'] = centroids[:, 0], centroids[:, 1]
# Ignore the label 0 since it is the background
label_stats.drop(0, inplace=True)

In [17]:
label_stats.head()


Out[17]:
left top width height area centroid_x centroid_y
1 0 0 15 25 375 7.000000 12.000000
2 140 0 2178 25 54450 1228.500000 12.000000
3 0 100 15 59 885 7.000000 129.000000
4 916 100 633 59 36608 1232.050016 128.580064
5 0 164 15 54 810 7.000000 190.500000

In [18]:
plt.scatter(label_stats.width, label_stats.height)


Out[18]:
<matplotlib.collections.PathCollection at 0x7f5143a48ed0>

In [19]:
label_stats.top.sort_index(ascending=False)


Out[19]:
166    3041
165    2957
164    2957
163    2686
162    2686
161    2686
160    2686
159    2664
158    2664
157    2630
156    2630
155    2630
154    2630
153    2608
152    2552
151    2496
150    2496
149    2496
148    2496
147    2496
146    2439
145    2383
144    2383
143    2383
142    2383
141    2383
140    2383
139    2383
138    2343
137    2343
       ... 
30      808
29      696
28      696
27      639
26      639
25      558
24      558
23      518
22      518
21      447
20      447
19      447
18      447
17      447
16      333
15      333
14      333
13      333
12      277
11      277
10      277
9       277
8       237
7       237
6       164
5       164
4       100
3       100
2         0
1         0
Name: top, Length: 166, dtype: int32

In [20]:
min_max_scaler = preprocessing.MinMaxScaler()
plt.figure(figsize=(10,20))
plt.scatter(label_stats.top, label_stats.left, s=min_max_scaler.fit_transform(label_stats.area) * 200)


DataConversionWarning: Data with input dtype int32 was converted to float64 by MinMaxScaler. [validation.py:429]
DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample. [data.py:321]
DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample. [data.py:356]
Out[20]:
<matplotlib.collections.PathCollection at 0x7f5143992d50>

In [21]:
label_stats[label_stats.top < 500]


Out[21]:
left top width height area centroid_x centroid_y
1 0 0 15 25 375 7.000000 12.000000
2 140 0 2178 25 54450 1228.500000 12.000000
3 0 100 15 59 885 7.000000 129.000000
4 916 100 633 59 36608 1232.050016 128.580064
5 0 164 15 54 810 7.000000 190.500000
6 843 164 778 54 38447 1230.804276 188.837309
7 0 237 15 27 405 7.000000 250.000000
8 153 237 2158 27 58266 1231.500000 250.000000
9 0 277 15 163 2445 7.000000 358.000000
10 1589 277 123 59 5898 1649.495083 304.985419
11 1866 277 135 51 6236 1932.292495 303.819436
12 2167 277 123 59 5898 2227.495083 304.985419
13 1276 333 174 107 14721 1362.311460 392.252972
14 1564 333 174 107 15580 1649.606290 389.952824
15 1853 333 174 107 15566 1938.543878 389.991006
16 2142 333 174 107 15567 2228.001670 390.047472
17 0 447 15 50 750 7.000000 471.500000
18 1351 447 60 50 2663 1378.298911 473.266617
19 1639 447 61 50 2692 1666.650817 473.360698
20 1928 447 61 50 2692 1955.650817 473.360698
21 2217 447 61 50 2692 2244.650817 473.360698

In [22]:
label_stats.iloc[36:46]


Out[22]:
left top width height area centroid_x centroid_y
37 1552 921 188 111 16703 1655.143627 970.823026
38 1838 921 191 111 16931 1942.726478 970.544563
39 2127 921 191 111 16948 2231.533455 970.772481
40 1019 977 144 107 11462 1098.317222 1027.419037
41 228 1033 216 59 10137 334.728322 1059.755845
42 1411 1055 38 29 1099 1429.516833 1068.987261
43 1700 1055 38 29 1099 1718.516833 1068.987261
44 1989 1055 37 29 1070 2007.016822 1068.986916
45 2278 1055 37 29 1070 2296.016822 1068.986916
46 1019 1089 144 108 11463 1098.317107 1139.835471

In [23]:
centroids[36:46]


Out[23]:
array([[ 1364.20007162,   970.59466396],
       [ 1655.14362689,   970.8230258 ],
       [ 1942.72647806,   970.54456323],
       [ 2231.53345527,   970.77248053],
       [ 1098.31722213,  1027.41903682],
       [  334.72832199,  1059.75584492],
       [ 1429.51683348,  1068.98726115],
       [ 1718.51683348,  1068.98726115],
       [ 2007.01682243,  1068.98691589],
       [ 2296.01682243,  1068.98691589]])

In [24]:
waste_copy = img_page4.copy()
for centroid in centroids[36:46]:
    x, y = centroid
    cv2.circle(waste_copy, (int(x), int(y)), 20, (0,0,255), -1)
plot_page(waste_copy)



In [25]:
label_stats['top_str'] = label_stats.top.apply(lambda x: str(x))

In [26]:
count_top = label_stats.groupby('top_str')['left'].count()
count_top[count_top == 5]


Out[26]:
top_str
1258    5
1483    5
1596    5
1708    5
2496    5
447     5
Name: left, dtype: int64

In [27]:
grouped_top = label_stats.groupby('top_str').apply(lambda x: x.iloc[0:10]).sort_values('top')
grouped_top[grouped_top.top_str == '1258']


Out[27]:
left top width height area centroid_x centroid_y top_str
top_str
1258 65 1912 1258 117 111 10758 1974.298661 1313.873861 1258
64 1624 1258 116 111 10617 1686.041066 1313.745879 1258
63 1335 1258 116 111 10506 1397.271274 1314.158862 1258
66 2201 1258 117 111 10741 2263.345126 1313.779350 1258
62 225 1258 410 51 18614 425.245568 1285.120071 1258

In [28]:
(label_stats.left - (label_stats.width // 2) ).iloc[12:16].tolist()


Out[28]:
[1189, 1477, 1766, 2055]

In [29]:
(label_stats.left + label_stats.width + 10 ).iloc[12:16].tolist()


Out[29]:
[1460, 1748, 2037, 2326]

In [30]:
(label_stats.left + label_stats.width + 10)[61:66].tolist()


Out[30]:
[645, 1461, 1750, 2039, 2328]

In [31]:
# draw lines at 1411, 1700, 1989, 2278
# left - width [1102, 1390, 1679, 1968]
# left - width / 2 [1189, 1477, 1766, 2055]

def plot_lines(line_xs, img, y_start, y_end):
    waste_copy2 = img.copy()
    for line_x in line_xs:
        line_starting_points = (line_x, y_start)
        line_stopping_points = (line_x, y_end)
        cv2.line(waste_copy2, line_starting_points, line_stopping_points, (125,255,0), 3)
    plot_page(waste_copy2)

plot_lines([1461, 1750, 2039, 2328], img_page4, 333, 3000)



In [32]:
plot_lines([1000,1200,1461, 1750, 2039, 2328], img_page4, 333, 3000)


Try the same for page 2


In [33]:
rlsa_page2 = rlsa(img_page2, (20, 25))
n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(rlsa_page2)

In [34]:
def get_label_stats(stats, centroids):
    stats_columns = ["left", "top", "width", "height", "area"]
    label_stats = pd.DataFrame(stats, columns=stats_columns)
    label_stats['centroid_x'], label_stats['centroid_y'] = centroids[:, 0], centroids[:, 1]
    # Ignore the label 0 since it is the background
    label_stats.drop(0, inplace=True)
    return label_stats

page2_label_stats = get_label_stats(stats, centroids)

In [35]:
page2_label_stats['top_str'] = page2_label_stats.top.apply(lambda x: str(x))
page2_label_stats.groupby('top_str').apply(lambda x: x.iloc[0:10]).sort_values('top')


Out[35]:
left top width height area centroid_x centroid_y top_str
top_str
0 1 0 0 20 25 500 9.500000 12.000000 0
2 135 0 2183 25 54575 1226.000000 12.000000 0
103 4 885 103 690 62 42076 1229.127793 133.112392 103
3 0 103 20 62 1240 9.500000 133.500000 103
177 6 1049 177 360 57 19420 1225.853708 205.540783 177
5 0 177 20 57 1140 9.500000 205.000000 177
246 7 0 246 20 66 1320 9.500000 278.500000 246
8 901 246 658 66 34215 1225.751308 277.009908 246
315 9 0 315 20 66 1320 9.500000 347.500000 315
10 827 315 805 66 41069 1232.540627 346.063040 315
383 11 0 383 20 66 1320 9.500000 415.500000 383
12 798 383 864 66 44366 1228.446445 413.491277 383
502 13 0 502 20 66 1320 9.500000 534.500000 502
14 142 502 462 65 25409 379.711559 531.492739 502
15 1016 502 446 66 24421 1242.827566 532.310348 502
16 1869 502 448 65 24894 2098.311762 531.411304 502
580 17 0 580 20 27 540 9.500000 593.000000 580
18 143 580 2164 27 58428 1224.500000 593.000000 580
620 19 0 620 20 59 1180 9.500000 649.000000 620
20 1833 620 206 59 9843 1933.956416 648.331505 620
621 21 1593 621 174 51 8152 1678.513494 646.764475 621
22 2154 621 162 51 7692 2232.769501 646.766381 621
693 24 1251 693 1065 27 28755 1783.000000 706.000000 693
23 0 693 20 27 540 9.500000 706.000000 693
733 25 0 733 20 113 2260 9.500000 789.000000 733
26 618 733 320 58 14864 775.951023 760.459634 733
27 1553 733 214 113 19317 1671.421235 782.727390 733
28 1873 733 169 57 8747 1957.048588 759.089859 733
29 2103 733 214 113 19346 2221.338261 782.709604 733
789 30 570 789 149 51 6448 640.050093 816.051489 789
... ... ... ... ... ... ... ... ... ...
2312 109 1201 2312 1115 27 30105 1758.000000 2325.000000 2312
2352 110 0 2352 20 114 2280 9.500000 2408.500000 2352
111 1108 2352 110 51 5085 1163.480826 2377.670403 2352
112 1277 2352 216 111 19416 1395.550783 2402.243871 2352
113 1553 2352 215 111 19639 1669.992158 2402.199145 2352
114 1828 2352 215 111 19683 1944.925926 2402.218107 2352
115 2103 2352 215 111 19667 2220.060558 2402.283216 2352
2408 116 1070 2408 149 58 7042 1143.765124 2435.896904 2408
2480 118 1201 2480 1115 27 30105 1758.000000 2493.000000 2480
117 0 2480 20 27 540 9.500000 2493.000000 2480
2521 119 0 2521 20 57 1140 9.500000 2549.000000 2521
123 1827 2521 215 57 11284 1933.903314 2547.162708 2521
120 969 2521 249 51 11604 1090.725009 2546.832644 2521
121 1277 2521 215 57 11140 1384.866876 2547.031688 2521
122 1553 2521 214 57 11232 1659.334046 2547.156428 2521
124 2103 2521 214 57 11232 2209.386485 2547.143162 2521
2593 125 0 2593 20 27 540 9.500000 2606.000000 2593
126 1201 2593 1115 27 30105 1758.000000 2606.000000 2593
2633 132 2103 2633 215 111 19667 2220.060558 2683.283216 2633
131 1828 2633 215 111 19683 1944.925926 2683.218107 2633
130 1553 2633 215 111 19639 1669.992158 2683.199145 2633
129 1277 2633 216 111 19416 1395.550783 2683.243871 2633
128 1108 2633 110 51 5085 1163.480826 2658.670403 2633
127 0 2633 20 114 2280 9.500000 2689.500000 2633
2689 133 1070 2689 149 58 7042 1143.765124 2716.896904 2689
2762 134 0 2762 20 27 540 9.500000 2775.000000 2762
135 1201 2762 1115 27 30105 1758.000000 2775.000000 2762
2957 137 135 2957 2157 86 81564 1227.328956 2981.084621 2957
136 0 2957 20 134 2680 9.500000 3023.500000 2957
3041 138 1234 3041 34 50 1584 1249.729167 3066.557449 3041

138 rows × 8 columns


In [36]:
line_xs = (page2_label_stats.left + page2_label_stats.width + 20).iloc[125:130].tolist()
plot_lines(line_xs, img_page2, 333, 3000)



In [37]:
line_xs


Out[37]:
[2336, 40, 1238, 1513, 1788]

Column Detection

Need to figure out column selection that would work on both with and without columnar lines.

Thouhgts/Ideas

Left - Width will not work as it varies with document pages and needs to be adjusted.
Left + Width + Buffer works :D. Since its numbers

TODO

  • Table Boundaries
  • Row Boundaries
  • Table Headers

In [38]:
eroded_image = cv2.erode(img_page2, kernel = np.ones((5,5),np.uint8), iterations=2)
rlsa_eroded_image = rlsa(eroded_image, (15, 25))
plot_page(rlsa_eroded_image)



In [39]:
eroded_image_page4 = cv2.erode(img_page4, kernel = np.ones((5,5),np.uint8), iterations=2)
rlsa_eroded_image_page4 = rlsa(eroded_image_page4, (15, 25))
plot_page(rlsa_eroded_image_page4)



In [40]:
waste_copy = img_page4.copy()
for index, row in label_stats[(label_stats.width > 1250) & (label_stats.top > 0)].iterrows():
    cv2.line(waste_copy, (20, row['top']), (2400, row['top']), (0,255,0), 5)
plot_page(waste_copy)



In [41]:
waste_copy = img_page2.copy()
for index, row in page2_label_stats[(page2_label_stats.width > 1200) & (page2_label_stats.top > 0)].iterrows():
    cv2.line(waste_copy, (20, row['top']), (2400, row['top']), (0,255,0), 5)
plot_page(waste_copy)



In [42]:
# the current method
def draw_table_bounds(image_gray):
        table_bounds = None
        temp_image, contours, hierarchy = cv2.findContours(image_gray,
                                                           cv2.RETR_LIST,
                                                           cv2.CHAIN_APPROX_SIMPLE)
        best_match_contour_index = None
        max_contour_size = 0
        count = 0
        for contour in contours:
            if cv2.contourArea(contour) > max_contour_size:
                contour_size = cv2.contourArea(contour)
                x, y, w, h = cv2.boundingRect(contour)
                if x > 0 and y > 0 and contour_size > max_contour_size:
                    best_match_contour_index = count
                    max_contour_size = contour_size
            count += 1
        if best_match_contour_index:
            x, y, w, h = cv2.boundingRect(contours[best_match_contour_index])
            x = x - 10
            w = w + 10
            cv2.rectangle(image_gray, (x, y), (x+w, y+h), (125, 125, 0), 5)
            cv2.rectangle(image_gray, (x, y), (x+w, y+h), (125, 125, 0), 5)
        return image_gray

In [43]:
# plot_page(draw_table_bounds(img_page4))

Detect similarly shaped blocks to merge together.

How to detect similar blocks ??

- Vertical
    - Similar left most points
    - Similar right most points
- Horizontal
    - Similar top most points
    - Similar bottom most points
- Shorter then the parent or similar width.

In [44]:
page2_label_stats['right'] = page2_label_stats.left + page2_label_stats.width
page2_label_stats['bottom'] = page2_label_stats.top + page2_label_stats.height
label_stats['right'] = label_stats.left + label_stats.width
label_stats['bottom'] = label_stats.top + label_stats.height

In [45]:
page2_label_stats


Out[45]:
left top width height area centroid_x centroid_y top_str right bottom
1 0 0 20 25 500 9.500000 12.000000 0 20 25
2 135 0 2183 25 54575 1226.000000 12.000000 0 2318 25
3 0 103 20 62 1240 9.500000 133.500000 103 20 165
4 885 103 690 62 42076 1229.127793 133.112392 103 1575 165
5 0 177 20 57 1140 9.500000 205.000000 177 20 234
6 1049 177 360 57 19420 1225.853708 205.540783 177 1409 234
7 0 246 20 66 1320 9.500000 278.500000 246 20 312
8 901 246 658 66 34215 1225.751308 277.009908 246 1559 312
9 0 315 20 66 1320 9.500000 347.500000 315 20 381
10 827 315 805 66 41069 1232.540627 346.063040 315 1632 381
11 0 383 20 66 1320 9.500000 415.500000 383 20 449
12 798 383 864 66 44366 1228.446445 413.491277 383 1662 449
13 0 502 20 66 1320 9.500000 534.500000 502 20 568
14 142 502 462 65 25409 379.711559 531.492739 502 604 567
15 1016 502 446 66 24421 1242.827566 532.310348 502 1462 568
16 1869 502 448 65 24894 2098.311762 531.411304 502 2317 567
17 0 580 20 27 540 9.500000 593.000000 580 20 607
18 143 580 2164 27 58428 1224.500000 593.000000 580 2307 607
19 0 620 20 59 1180 9.500000 649.000000 620 20 679
20 1833 620 206 59 9843 1933.956416 648.331505 620 2039 679
21 1593 621 174 51 8152 1678.513494 646.764475 621 1767 672
22 2154 621 162 51 7692 2232.769501 646.766381 621 2316 672
23 0 693 20 27 540 9.500000 706.000000 693 20 720
24 1251 693 1065 27 28755 1783.000000 706.000000 693 2316 720
25 0 733 20 113 2260 9.500000 789.000000 733 20 846
26 618 733 320 58 14864 775.951023 760.459634 733 938 791
27 1553 733 214 113 19317 1671.421235 782.727390 733 1767 846
28 1873 733 169 57 8747 1957.048588 759.089859 733 2042 790
29 2103 733 214 113 19346 2221.338261 782.709604 733 2317 846
30 570 789 149 51 6448 640.050093 816.051489 789 719 840
... ... ... ... ... ... ... ... ... ... ...
109 1201 2312 1115 27 30105 1758.000000 2325.000000 2312 2316 2339
110 0 2352 20 114 2280 9.500000 2408.500000 2352 20 2466
111 1108 2352 110 51 5085 1163.480826 2377.670403 2352 1218 2403
112 1277 2352 216 111 19416 1395.550783 2402.243871 2352 1493 2463
113 1553 2352 215 111 19639 1669.992158 2402.199145 2352 1768 2463
114 1828 2352 215 111 19683 1944.925926 2402.218107 2352 2043 2463
115 2103 2352 215 111 19667 2220.060558 2402.283216 2352 2318 2463
116 1070 2408 149 58 7042 1143.765124 2435.896904 2408 1219 2466
117 0 2480 20 27 540 9.500000 2493.000000 2480 20 2507
118 1201 2480 1115 27 30105 1758.000000 2493.000000 2480 2316 2507
119 0 2521 20 57 1140 9.500000 2549.000000 2521 20 2578
120 969 2521 249 51 11604 1090.725009 2546.832644 2521 1218 2572
121 1277 2521 215 57 11140 1384.866876 2547.031688 2521 1492 2578
122 1553 2521 214 57 11232 1659.334046 2547.156428 2521 1767 2578
123 1827 2521 215 57 11284 1933.903314 2547.162708 2521 2042 2578
124 2103 2521 214 57 11232 2209.386485 2547.143162 2521 2317 2578
125 0 2593 20 27 540 9.500000 2606.000000 2593 20 2620
126 1201 2593 1115 27 30105 1758.000000 2606.000000 2593 2316 2620
127 0 2633 20 114 2280 9.500000 2689.500000 2633 20 2747
128 1108 2633 110 51 5085 1163.480826 2658.670403 2633 1218 2684
129 1277 2633 216 111 19416 1395.550783 2683.243871 2633 1493 2744
130 1553 2633 215 111 19639 1669.992158 2683.199145 2633 1768 2744
131 1828 2633 215 111 19683 1944.925926 2683.218107 2633 2043 2744
132 2103 2633 215 111 19667 2220.060558 2683.283216 2633 2318 2744
133 1070 2689 149 58 7042 1143.765124 2716.896904 2689 1219 2747
134 0 2762 20 27 540 9.500000 2775.000000 2762 20 2789
135 1201 2762 1115 27 30105 1758.000000 2775.000000 2762 2316 2789
136 0 2957 20 134 2680 9.500000 3023.500000 2957 20 3091
137 135 2957 2157 86 81564 1227.328956 2981.084621 2957 2292 3043
138 1234 3041 34 50 1584 1249.729167 3066.557449 3041 1268 3091

138 rows × 10 columns


In [46]:
label_2_label_map = np.zeros((page2_label_stats.shape[0], page2_label_stats.shape[0]))

In [47]:
label_2_label_map[0]


Out[47]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [48]:
for index, value in enumerate(label_2_label_map[10]):
    base_label = page2_label_stats.iloc[13]
    print(base_label)
    similarly_aligned = page2_label_stats[(page2_label_stats.top > base_label.top) & 
                                          (page2_label_stats.right.between(base_label.right - 10, base_label.right + 10)) &
                                          (page2_label_stats.width.between(base_label.width - 20, base_label.width + 20))]
    print(similarly_aligned)
    similarly_aligned = similarly_aligned.append(base_label)
    break


left              142
top               502
width             462
height             65
area            25409
centroid_x    379.712
centroid_y    531.493
top_str           502
right             604
bottom            567
Name: 14, dtype: object
    left   top  width  height   area  centroid_x   centroid_y top_str  right  \
85   144  1889    457     108  32860  356.778119  1934.333932    1889    601   

    bottom  
85    1997  

In [49]:
similarly_aligned.index


Out[49]:
Int64Index([85, 14], dtype='int64')

In [50]:
page_layout = pdf.getPage(3)['/MediaBox']
if '/Rotate' in pdf.getPage(3) and pdf.getPage(3)['/Rotate'] == 90:
    page_width = float(page_layout[3])
    page_height = float(page_layout[2])
else:
    page_width = float(page_layout[2])
    page_height = float(page_layout[3])

In [51]:
image_height, image_width = img_page2.shape
horizontal_ratio = page_width/image_width
vertical_ratio = page_height/image_height

In [52]:
waste_copy = img_page2.copy()
for index, row in page2_label_stats[page2_label_stats.index.isin(similarly_aligned.index)].iterrows():
    print(row.left * horizontal_ratio, row.top * vertical_ratio)
    print(row.height * vertical_ratio, row.width * horizontal_ratio)
    cv2.rectangle(waste_copy, (row['left'], row['top']), (row['right'], row['bottom']), (125,125,0), 5)
plot_page(waste_copy)


(34.082291246470355, 120.49144811858609)
(15.601482326111745, 110.8874546187979)
(34.5623235175474, 453.403078677309)
(25.922462941847208, 109.68737394110529)

In [53]:
def find_similar_blocks(row, label_stats, similarity_map):
    similarly_aligned = label_stats[(label_stats.top > row.top) & 
                                    (label_stats.right.between(row.right - 10, row.right + 10)) &
                                    (label_stats.width.between(row.width - 20, row.width + 20))]
    positions = [[row['pos'] - 1] * len(similarly_aligned), similarly_aligned.index.tolist()]
    similarity_map[positions] = 1
    return None

In [54]:
def add_similarity_map(label_stats):
    similarity_map = np.zeros((label_stats.shape[0], label_stats.shape[0]))
    label_stats.apply(find_similar_blocks, args=[label_stats, similarity_map], axis=1)
    similarity_df = pd.DataFrame(similarity_map, columns=['label_{0}'.format(index) for index in range(label_stats.shape[0])],
                                 index=label_stats.index)
    similarity_df['count'] = similarity_df.apply(sum, axis=1)
    return pd.concat([label_stats, similarity_df], axis=1)

In [55]:
page2_label_stats['pos'] = page2_label_stats.index
similar_labels = add_similarity_map(page2_label_stats)

In [56]:
similar_labels[(similar_labels.left > 0) & (similar_labels['count'] > 1)]


Out[56]:
left top width height area centroid_x centroid_y top_str right bottom ... label_129 label_130 label_131 label_132 label_133 label_134 label_135 label_136 label_137 count
18 143 580 2164 27 58428 1224.500000 593.000000 580 2307 607 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
20 1833 620 206 59 9843 1933.956416 648.331505 620 2039 679 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 8.0
27 1553 733 214 113 19317 1671.421235 782.727390 733 1767 846 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0
28 1873 733 169 57 8747 1957.048588 759.089859 733 2042 790 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
29 2103 733 214 113 19346 2221.338261 782.709604 733 2317 846 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 9.0
37 1553 902 214 57 11192 1659.620622 928.225965 902 1767 959 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8.0
39 2103 902 214 57 11226 2209.406378 928.139319 902 2317 959 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 8.0
62 1201 1443 1115 27 30105 1758.000000 1456.000000 1443 2316 1470 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 8.0
68 1103 1596 114 51 5405 1160.456614 1621.225532 1596 1217 1647 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
69 1279 1596 215 111 19647 1395.623912 1645.971802 1596 1494 1707 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0
70 1552 1596 216 111 19835 1669.511772 1646.143181 1596 1768 1707 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0
71 1827 1596 216 111 19739 1944.744668 1645.944830 1596 2043 1707 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0
72 2102 1596 216 111 19758 2219.727098 1645.939366 1596 2318 1707 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 7.0
73 1070 1652 149 58 7042 1143.765124 1679.896904 1652 1219 1710 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 3.0
75 1201 1724 1115 27 30105 1758.000000 1737.000000 1724 2316 1751 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 7.0
78 1279 1764 213 57 11128 1384.561916 1790.102894 1764 1492 1821 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0
79 1552 1764 215 57 11284 1658.860510 1790.116005 1764 1767 1821 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0
80 1827 1764 215 57 11273 1933.878116 1790.075047 1764 2042 1821 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0
81 2102 1764 215 57 11264 2208.825994 1790.029652 1764 2317 1821 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0
83 1201 1837 1115 27 30105 1758.000000 1850.000000 1837 2316 1864 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 6.0
86 1103 1946 114 51 5405 1160.456614 1971.225532 1946 1217 1997 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
87 1277 1946 215 111 18661 1397.109426 1994.193559 1946 1492 2057 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0
88 1552 1946 216 111 19758 1669.740814 1995.957081 1946 1768 2057 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0
89 1827 1946 216 111 19715 1944.900178 1996.003145 1946 2043 2057 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0
90 2102 1946 216 111 19737 2219.742818 1995.971272 1946 2318 2057 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 5.0
91 1070 2002 149 58 7042 1143.765124 2029.896904 2002 1219 2060 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2.0
93 1201 2074 1115 27 30105 1758.000000 2087.000000 2074 2316 2101 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 5.0
96 1277 2114 215 57 11288 1383.735383 2140.092310 2114 1492 2171 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0
97 1552 2114 215 57 11312 1658.689003 2140.116160 2114 1767 2171 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0
98 1827 2114 215 57 11289 1933.839844 2140.082735 2114 2042 2171 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0
99 2102 2114 215 57 11315 2208.660539 2140.126911 2114 2317 2171 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 4.0
101 1201 2187 1115 27 30105 1758.000000 2200.000000 2187 2316 2214 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 4.0
104 1277 2239 215 57 11140 1384.866876 2265.031688 2239 1492 2296 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
105 1553 2239 214 57 11232 1659.334046 2265.156428 2239 1767 2296 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
106 1827 2239 215 57 11284 1933.903314 2265.162708 2239 2042 2296 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
107 2103 2239 214 57 11232 2209.386485 2265.143162 2239 2317 2296 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 3.0
109 1201 2312 1115 27 30105 1758.000000 2325.000000 2312 2316 2339 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 3.0
112 1277 2352 216 111 19416 1395.550783 2402.243871 2352 1493 2463 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
113 1553 2352 215 111 19639 1669.992158 2402.199145 2352 1768 2463 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
114 1828 2352 215 111 19683 1944.925926 2402.218107 2352 2043 2463 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
115 2103 2352 215 111 19667 2220.060558 2402.283216 2352 2318 2463 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 2.0
118 1201 2480 1115 27 30105 1758.000000 2493.000000 2480 2316 2507 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2.0

42 rows × 150 columns

From these similar labels we need to figure out structure defining labels

For each parent label remove child label.

Maybe use Argmax on the label_x columns to get the indices and then generate the sets.   
Also ignore the labels with left == 0

In [57]:
similar_label_indices = similar_labels[(similar_labels.left > 0) & (similar_labels['count'] > 1)].index.tolist()
parent_label_indices = {}
marked_label_indices = []

In [58]:
# for index in similar_label_indices:

Sudden Realization, I might not need what I am currently doing for table boudaries and columns!!

Stupendo...


In [59]:
waste_copy = img_page2.copy()
for index, row in page2_label_stats[(page2_label_stats.width > 1200) & (page2_label_stats.top > 0)].iterrows():
    cv2.line(waste_copy, (row['left'], row['top']), (row['right'] + 50, row['top']), (0,255,0), 5)
plot_page(waste_copy)



In [60]:
waste_copy = img_page4.copy()
for index, row in label_stats[(label_stats.width > 1200) & (label_stats.top > 0)].iterrows():
    cv2.line(waste_copy, (row['left'], row['top']), (row['left'] + row['width'] + 50, row['top']), (0,255,0), 5)
plot_page(waste_copy)


In the same process remove lines that are in the range of 100 points vertically
and join the left most and right most points of two independent lines


In [61]:
possible_boundary_lines = page2_label_stats[(page2_label_stats.width > 1200) & (page2_label_stats.top > 0)]
possible_boundary_lines


Out[61]:
left top width height area centroid_x centroid_y top_str right bottom pos
18 143 580 2164 27 58428 1224.500000 593.000000 580 2307 607 18
41 143 974 2164 27 58428 1224.500000 987.000000 974 2307 1001 41
47 148 1162 2163 27 58401 1229.000000 1175.000000 1162 2311 1189 47
137 135 2957 2157 86 81564 1227.328956 2981.084621 2957 2292 3043 137

In [62]:
possible_boundary_lines['dist_from_next_line'] = possible_boundary_lines.top.shift(-1) - possible_boundary_lines.top
possible_boundary_lines['dist_from_prev_line'] = possible_boundary_lines.top - possible_boundary_lines.top.shift(1)
possible_boundary_lines


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:1]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:2]
Out[62]:
left top width height area centroid_x centroid_y top_str right bottom pos dist_from_next_line dist_from_prev_line
18 143 580 2164 27 58428 1224.500000 593.000000 580 2307 607 18 394.0 NaN
41 143 974 2164 27 58428 1224.500000 987.000000 974 2307 1001 41 188.0 394.0
47 148 1162 2163 27 58401 1229.000000 1175.000000 1162 2311 1189 47 1795.0 188.0
137 135 2957 2157 86 81564 1227.328956 2981.084621 2957 2292 3043 137 NaN 1795.0

In [63]:
def get_possible_boundary_lines(label_stats):
    possible_boundary_lines = label_stats[(label_stats.width > 1200) & (label_stats.top > 0)]
    possible_boundary_lines['dist_from_next_line'] = possible_boundary_lines.top.shift(-1) - possible_boundary_lines.top
    possible_boundary_lines['dist_from_prev_line'] = possible_boundary_lines.top - possible_boundary_lines.top.shift(1)
    possible_boundary_lines.replace(pd.np.nan, pd.np.inf, inplace=True)
    return possible_boundary_lines[(possible_boundary_lines.dist_from_next_line > 100) & (possible_boundary_lines.dist_from_prev_line > 100)]

In [64]:
get_possible_boundary_lines(page2_label_stats)


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:3]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:4]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]
Out[64]:
left top width height area centroid_x centroid_y top_str right bottom pos dist_from_next_line dist_from_prev_line
18 143 580 2164 27 58428 1224.500000 593.000000 580 2307 607 18 394.000000 inf
41 143 974 2164 27 58428 1224.500000 987.000000 974 2307 1001 41 188.000000 394.000000
47 148 1162 2163 27 58401 1229.000000 1175.000000 1162 2311 1189 47 1795.000000 188.000000
137 135 2957 2157 86 81564 1227.328956 2981.084621 2957 2292 3043 137 inf 1795.000000

In [65]:
get_possible_boundary_lines(label_stats)


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:3]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:4]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]
Out[65]:
left top width height area centroid_x centroid_y top_str right bottom dist_from_next_line dist_from_prev_line
8 153 237 2158 27 58266 1231.500000 250.00000 237 2311 264 281.000000 inf
165 140 2957 2152 86 80811 1229.489475 2980.93809 2957 2292 3043 inf 2399.000000

In [66]:
def draw_table_bounding_box(img, label_stats):
    possible_boundary_lines = get_possible_boundary_lines(label_stats).reset_index()
    waste_copy = img.copy()
    tables = []
    if len(possible_boundary_lines) % 2 != 0:
        raise "Uneven Probable Bounding Lines"
    for index, row in enumerate(possible_boundary_lines.iterrows()):
        if index % 2 != 0:
            table_bounds = {}
            top_bound = possible_boundary_lines.iloc[index - 1]
            bottom_bound = possible_boundary_lines.iloc[index]
            cv2.rectangle(waste_copy, (int(top_bound['left']), int(top_bound['top'])), 
                          (int(bottom_bound['right']) + 50, int(bottom_bound['bottom'])), (125,125,0), 5)
            table_bounds['left'], table_bounds['top'] = int(top_bound['left']), int(top_bound['top'])
            table_bounds['right'], table_bounds['bottom'] = int(bottom_bound['right']) + 50, int(bottom_bound['bottom'])
            tables.append(table_bounds)
    plot_page(waste_copy)
    return tables

In [67]:
tables = draw_table_bounding_box(img_page2, page2_label_stats)


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:3]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:4]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]

In [68]:
tables = draw_table_bounding_box(img_page4, label_stats)


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:3]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:4]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]

Now combine the column output and table boundary to send to tabula and check if it works!!

java -jar parsers/tabula-0.9.2-jar-with-dependencies.jar --pages %s --area %s,%s,%s,%s --columns %s '%s'" % ( page_num+1, table_bounds["top"], table_bounds["left"], table_bounds["bottom"], table_bounds["right"], column_values, input_pdf_filepath)

In [69]:
tables = draw_table_bounding_box(img_page4, label_stats)
column_values = ','.join(['237','1000','1200', '1461', '1750', '2039'])
for table_bounds in tables:
    command = "java -jar parsers/tabula-0.9.2-jar-with-dependencies.jar --pages %s --area %s,%s,%s,%s --columns %s '%s'"% ( 5, table_bounds["top"], table_bounds["left"], table_bounds["bottom"], table_bounds["right"], column_values, pdf_file_path)
    print(command)


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:3]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:4]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]
java -jar parsers/tabula-0.9.2-jar-with-dependencies.jar --pages 5 --area 237,153,3043,2342 --columns 237,1000,1200,1461,1750,2039 'parsers/pdfs/West Bengal/2017-18/2017_bp11_Demand Nos.1-5.pdf'

In [70]:
page_table_data = subprocess.check_output(command, shell=True)

In [71]:
page_table_data.split('\n')


Out[71]:
['"","Charged 1,65,653 1,32,000 1,71,000 1,76,000"\r',
 '"",Voted ... ... ... ...\r',
 '"",Charged ... ... ... ...\r',
 '"",Voted ... ... ... ...\r',
 '"",Charged ... ... ... ...\r',
 '"","Voted 8,400 9,000 9,000 9,000"\r',
 '"","Charged 16,300 18,000 30,000 31,000"\r',
 '"","Voted ... 13,15,000 20,55,000 30,23,000"\r',
 '"","Charged ... 9,000 12,000 18,000"\r',
 'e,"Voted 75,08,018 1,25,28,000 1,25,28,000 1,30,29,000"\r',
 '"","Charged 74,299 72,000 72,000 75,000"\r',
 '"","Voted 91,18,558 1,59,89,000 1,59,89,000 1,71,08,000"\r',
 '"","Charged 3,19,000 7,78,000 7,78,000 8,32,000"\r',
 'ce,"Voted 56,20,748 93,84,000 93,84,000 98,53,000"\r',
 '"","Charged 44,000 48,000 48,000 50,000"\r',
 '"",Voted ... ... ... ...\r',
 '"",Charged ... ... ... ...\r',
 '"",Voted ... ... ... ...\r',
 '"",Charged ... ... ... ...\r',
 '"",-----------------------------------------------------------------------------------------\r',
 'Total - 2011-02-101,"-NP-001-01 5,13,70,408 5,90,68,000 7,04,26,000 7,44,35,000"\r',
 '"",-----------------------------------------------------------------------------------------\r',
 '"","Voted 5,07,51,156 5,80,11,000 6,93,15,000 7,32,53,000"\r',
 '"","Charged 6,19,252 10,57,000 11,11,000 11,82,000"\r',
 '"",----------------------------------------------------------------------------------------\r',
 '"","Voted 1,48,25,315 1,76,00,000 1,76,00,000 1,93,60,000"\r',
 '"","Charged 3,85,986 12,00,000 12,00,000 12,00,000"\r',
 '"","Voted 10,36,05,646 18,00,00,000 18,00,00,000 18,00,00,000"\r',
 '"","Charged 1,21,328 15,00,000 15,00,000 16,35,000"\r',
 'nder WBHS 2008,Voted ... ... ... ...\r',
 '"",Charged ... ... ... ...\r',
 '"",Budget Publication No. 11\r',
 '"",3\r',
 '']
  • Table Bounding Box -- Done
  • Columns -- Need Improvement
  • Rows -- Not Started

And Tabula isn't giving good results for West Bengal

We are able to get text of blocks using pdftotext from poppler-0.56.0

pdftotext -f 3 -l 3 -x 34 -y 453 -W 110 -H 26 parsers/pdfs/West\ Bengal/2017-18/2017_bp11_Demand\ Nos.1-5.pdf -

x, y, w and h all need to be multiplied with their respective vertical and horizontal ratios for it to work.

Lets explore how we can build rows.

Also need to figure out a better table bounding box technique. The thought is to figure out how to decide the amount content between 2 given lines. Maybe some kind of measure of content within given 2 lines (Ratio of Sum of black pixels and total pixels)


In [72]:
page2_tables = draw_table_bounding_box(img_page2, page2_label_stats)


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:3]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:4]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]

In [73]:
def get_text(page_num, pdf_file_name, x, y, w, h):
    command = 'pdftotext -enc UTF-8  -f {0} -l {0} -x {1} -y {2} -W {3} -H {4} "{5}" -'.format(page_num + 1,
                                                                                 int(x), 
                                                                                 int(y),
                                                                                 int(w),
                                                                                 int(h),
                                                                                 pdf_file_name)
    return subprocess.check_output(command, shell=True)

def get_text_data(row, page_num, pdf_file_path, horizontal_ratio, vertical_ratio):
    x = (row['left'] * horizontal_ratio)
    y = (row['top'] * vertical_ratio)
    width = (row['width'] * horizontal_ratio) + 5
    hieght = (row['height'] * vertical_ratio) + 5
    text = get_text(page_num, pdf_file_path, x, y, width, hieght)
    row['text'] = text.strip().replace('-','')
    row['text_length'] = len(row['text'])
    row['possible_row_merger'] = '\n' in row['text']
    text_matched = re.findall('[a-zA-Z_]+', row['text'])
    comma_sep_matcher = re.compile('^[1-9].(,[0-9]).*$')
    if comma_sep_matcher.match(row['text'].replace('\n', ' ')):
        row['comma_separated_numbers_present'] = True
    else:
        row['comma_separated_numbers_present'] = False
    if len(text_matched) > 0:
        row['is_text'] = True
    else:
        row['is_text'] = False
        
    try:
        row['number'] = int(row['text'].replace(',',''))
    except:
        row['number'] = None
    return row

def mark_number_cells(row):
    if row['comma_separated_numbers_present'] or row['is_text'] is False:
        row['label'] = 'cell_values'
    return row


def mark_text_cells(row, labels):
    '''
    Detect cells below, left and right. Based on these cells make guesses of labels.
    '''
    left, right = row['left'], row['right']
    top, bottom = row['top'], row['bottom']
    if row['label'] != 'cell_values':        
        if (labels[(labels.right.between(right - 10, right + 10)) & 
                     (labels.top > top) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_header'
        elif (labels[(labels.top.between(top - 10, top + 10)) & 
                     (labels.left > left) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_grouping'
        elif (labels[(labels.bottom.between(bottom - 10, bottom + 10)) & 
                     (labels.left > left) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_grouping'
    return row
    

def get_possible_horizontal_lines(text_labels):
    possible_line_points = text_labels.bottom.value_counts()
    possible_line_points = possible_line_points[possible_line_points > 1]
    return possible_line_points.index


for table_bounds in page2_tables:
    table = rlsa_page2[table_bounds['top']:table_bounds['bottom'],
                        table_bounds['left']:table_bounds['right']]
    labels_in_table = page2_label_stats[(page2_label_stats.top > (table_bounds['top']  + 5)) & 
                                        (page2_label_stats.bottom < (table_bounds['bottom'] - 5)) &
                                        (page2_label_stats.left > (table_bounds['left'] - 5)) &
                                        (page2_label_stats.right < (table_bounds['right'] + 5))]
    labels_in_table = labels_in_table.apply(get_text_data, axis=1, args=[2, pdf_file_path,
                                                                         horizontal_ratio, vertical_ratio])
    text_labels = labels_in_table[labels_in_table.text_length > 0].apply(mark_number_cells, axis=1)
    text_labels = text_labels.apply(mark_text_cells, axis=1, args=[text_labels])
#     possible_horizontal_lines = get_possible_horizontal_lines(text_labels)
    plot_page(rlsa(img_page2[table_bounds['top']:table_bounds['bottom'],
                        table_bounds['left']:table_bounds['right']], (15, 25)))
plot_page(img_page2)


  • Cross check if RLSA works better on zoomed images. Yes it Does perform differently, not to much with the thresholds we are using but there is certainly a difference.

Though We need to figure out which blocks to split, given the new combined information of text and position can we estimate which blocks should be combined or categorized also.

- One basic rule of thumb for all documents, if number is involved and the block next to it is also a number   
  we can consider it as a cell.
- If text is involved and it is positioned on the left side somewhere in the middle of the doc or towards   
  the end it would most probably be a cell.
- For text that are in the top region, directly above cells can be considered as Headers.
- Now the problem of merging and un merging of blocks generated needs to be Resolved, somehow.
- Also another problem to ponder about, groupings or multi level structure - We have already tagged all 

In [74]:
text_labels


Out[74]:
area bottom centroid_x centroid_y comma_separated_numbers_present height is_text label left number pos possible_row_merger right text text_length top top_str width
49 6212 1261 1675.519961 1229.796523 False 59 True NaN 1613 NaN 49 False 1740 Budget 6 1202 1202 127
50 6578 1253 1943.899970 1228.496047 False 51 True NaN 1875 NaN 50 False 2015 Revised 7 1202 1202 140
51 6254 1261 2225.048129 1229.807963 False 59 True NaN 2162 NaN 51 False 2290 Budget 6 1202 1202 128
52 15406 1365 1401.864858 1316.907763 False 107 True cell_header 1313 NaN 52 True 1492 Actuals,\n20152016 17 1258 1258 179
53 16239 1365 1675.118111 1314.721535 False 107 True cell_header 1587 NaN 53 True 1766 Estimate,\n20162017 18 1258 1258 179
54 16225 1365 1950.056086 1314.757966 False 107 True cell_header 1862 NaN 54 True 2041 Estimate,\n20162017 18 1258 1258 179
55 16260 1365 2225.510578 1314.892005 False 107 True cell_header 2137 NaN 55 True 2316 Estimate,\n20172018 18 1258 1258 179
57 2913 1422 1417.744936 1398.115002 False 50 True NaN 1388 NaN 57 False 1453 Rs. 3 1372 1372 65
58 2913 1422 1692.744936 1398.115002 False 50 True NaN 1663 NaN 58 False 1728 Rs. 3 1372 1372 65
59 2942 1422 1967.092114 1398.202583 False 50 True NaN 1937 NaN 59 False 2003 Rs. 3 1372 1372 66
60 2942 1422 2242.092114 1398.202583 False 50 True NaN 2212 NaN 60 False 2278 Rs. 3 1372 1372 66
65 22148 1534 442.953766 1507.708868 False 51 True NaN 223 NaN 65 False 664 STATE LEGISLATURES 18 1483 1483 441
67 31709 1647 349.448264 1585.197042 False 108 True NaN 144 NaN 67 True 577 101 Legislative Assembly\nNPNon Plan 35 1539 1539 433
68 5405 1647 1160.456614 1621.225532 False 51 True cell_grouping 1103 NaN 68 False 1217 Voted 5 1596 1596 114
69 19647 1707 1395.623912 1645.971802 True 111 False cell_values 1279 NaN 69 True 1494 17,20,13,766\n11,26,566 22 1596 1596 215
70 19835 1707 1669.511772 1646.143181 True 111 False cell_values 1552 NaN 70 True 1768 26,26,46,000\n39,97,000 22 1596 1596 216
71 19739 1707 1944.744668 1645.944830 True 111 False cell_values 1827 NaN 71 True 2043 27,39,50,000\n40,51,000 22 1596 1596 216
72 19758 1707 2219.727098 1645.939366 True 111 False cell_values 2102 NaN 72 True 2318 28,79,98,000\n42,79,000 22 1596 1596 216
73 7042 1710 1143.765124 1679.896904 False 58 True cell_grouping 1070 NaN 73 False 1219 Charged 7 1652 1652 149
77 8887 1815 1120.465736 1789.383594 False 51 True cell_grouping 1027 NaN 77 False 1216 Total 101 10 1764 1764 189
78 11128 1821 1384.561916 1790.102894 True 57 False cell_values 1279 173140332.0 78 False 1492 17,31,40,332 12 1764 1764 213
79 11284 1821 1658.860510 1790.116005 True 57 False cell_values 1552 266643000.0 79 False 1767 26,66,43,000 12 1764 1764 215
80 11273 1821 1933.878116 1790.075047 True 57 False cell_values 1827 278001000.0 80 False 2042 27,80,01,000 12 1764 1764 215
81 11264 1821 2208.825994 1790.029652 True 57 False cell_values 2102 292277000.0 81 False 2317 29,22,77,000 12 1764 1764 215
85 32860 1997 356.778119 1934.333932 False 108 True NaN 144 NaN 85 True 601 103 Legislative Secretariat\nNPNon Plan 38 1889 1889 457
86 5405 1997 1160.456614 1971.225532 False 51 True cell_grouping 1103 NaN 86 False 1217 Voted 5 1946 1946 114
87 18661 2057 1397.109426 1994.193559 True 111 False cell_values 1277 NaN 87 True 1492 23,84,31,139\n4,92,764 21 1946 1946 215
88 19758 2057 1669.740814 1995.957081 True 111 False cell_values 1552 NaN 88 True 1768 34,07,78,000\n15,30,000 22 1946 1946 216
89 19715 2057 1944.900178 1996.003145 True 111 False cell_values 1827 NaN 89 True 2043 28,41,78,000\n15,30,000 22 1946 1946 216
90 19737 2057 2219.742818 1995.971272 True 111 False cell_values 2102 NaN 90 True 2318 32,70,68,000\n16,31,000 22 1946 1946 216
91 7042 2060 1143.765124 2029.896904 False 58 True cell_grouping 1070 NaN 91 False 1219 Charged 7 2002 2002 149
95 9013 2165 1121.750361 2139.285698 False 51 True cell_grouping 1027 NaN 95 False 1217 Total 103 10 2114 2114 190
96 11288 2171 1383.735383 2140.092310 True 57 False cell_values 1277 238923903.0 96 False 1492 23,89,23,903 12 2114 2114 215
97 11312 2171 1658.689003 2140.116160 True 57 False cell_values 1552 342308000.0 97 False 1767 34,23,08,000 12 2114 2114 215
98 11289 2171 1933.839844 2140.082735 True 57 False cell_values 1827 285708000.0 98 False 2042 28,57,08,000 12 2114 2114 215
99 11315 2171 2208.660539 2140.126911 True 57 False cell_values 2102 328699000.0 99 False 2317 32,86,99,000 12 2114 2114 215
103 15930 2290 1042.681293 2265.474451 False 51 True cell_grouping 874 NaN 103 False 1218 Grand Total Gross 18 2239 2239 344
104 11140 2296 1384.866876 2265.031688 True 57 False cell_values 1277 412064235.0 104 False 1492 41,20,64,235 12 2239 2239 215
105 11232 2296 1659.334046 2265.156428 True 57 False cell_values 1553 608951000.0 105 False 1767 60,89,51,000 12 2239 2239 214
106 11284 2296 1933.903314 2265.162708 True 57 False cell_values 1827 563709000.0 106 False 2042 56,37,09,000 12 2239 2239 215
107 11232 2296 2209.386485 2265.143162 True 57 False cell_values 2103 620976000.0 107 False 2317 62,09,76,000 12 2239 2239 214
111 5085 2403 1163.480826 2377.670403 False 51 True cell_grouping 1108 NaN 111 False 1218 Voted 5 2352 2352 110
112 19416 2463 1395.550783 2402.243871 True 111 False cell_values 1277 NaN 112 True 1493 41,04,44,905\n16,19,330 22 2352 2352 216
113 19639 2463 1669.992158 2402.199145 True 111 False cell_values 1553 NaN 113 True 1768 60,34,24,000\n55,27,000 22 2352 2352 215
114 19683 2463 1944.925926 2402.218107 True 111 False cell_values 1828 NaN 114 True 2043 55,81,28,000\n55,81,000 22 2352 2352 215
115 19667 2463 2220.060558 2402.283216 True 111 False cell_values 2103 NaN 115 True 2318 61,50,66,000\n59,10,000 22 2352 2352 215
116 7042 2466 1143.765124 2435.896904 False 58 True cell_grouping 1070 NaN 116 False 1219 Charged 7 2408 2408 149
120 11604 2572 1090.725009 2546.832644 False 51 True cell_grouping 969 NaN 120 False 1218 NP Non Plan 12 2521 2521 249
121 11140 2578 1384.866876 2547.031688 True 57 False cell_values 1277 412064235.0 121 False 1492 41,20,64,235 12 2521 2521 215
122 11232 2578 1659.334046 2547.156428 True 57 False cell_values 1553 608951000.0 122 False 1767 60,89,51,000 12 2521 2521 214
123 11284 2578 1933.903314 2547.162708 True 57 False cell_values 1827 563709000.0 123 False 2042 56,37,09,000 12 2521 2521 215
124 11232 2578 2209.386485 2547.143162 True 57 False cell_values 2103 620976000.0 124 False 2317 62,09,76,000 12 2521 2521 214
128 5085 2684 1163.480826 2658.670403 False 51 True cell_grouping 1108 NaN 128 False 1218 Voted 5 2633 2633 110
129 19416 2744 1395.550783 2683.243871 True 111 False cell_values 1277 NaN 129 True 1493 41,04,44,905\n16,19,330 22 2633 2633 216
130 19639 2744 1669.992158 2683.199145 True 111 False cell_values 1553 NaN 130 True 1768 60,34,24,000\n55,27,000 22 2633 2633 215
131 19683 2744 1944.925926 2683.218107 True 111 False cell_values 1828 NaN 131 True 2043 55,81,28,000\n55,81,000 22 2633 2633 215
132 19667 2744 2220.060558 2683.283216 True 111 False cell_values 2103 NaN 132 True 2318 61,50,66,000\n59,10,000 22 2633 2633 215
133 7042 2747 1143.765124 2716.896904 False 58 True cell_grouping 1070 NaN 133 False 1219 Charged 7 2689 2689 149

There are flaws in the current approach as I am just checking adjacent points based on points, it should rather look for adjacent block with respect to direction and alignment alone, then utilize the information of that block to decide whether the current block is a cell, header or . That should make it more robust.


In [75]:
text_labels[['text', 'label']][pd.isnull(text_labels.label)]


Out[75]:
text label
49 Budget NaN
50 Revised NaN
51 Budget NaN
57 Rs. NaN
58 Rs. NaN
59 Rs. NaN
60 Rs. NaN
65 STATE LEGISLATURES NaN
67 101 Legislative Assembly\nNPNon Plan NaN
85 103 Legislative Secretariat\nNPNon Plan NaN

Intervention

Currently we are able to detect number based cells, with a minor issue of splitting them wherever required but for this we have already shown that this is detectable in the text also so its not all bad, Cell headers are also working and lowest level of grouping is also working decently and the rule intuitively seems general enough.

What About Multi Level Groupings?

This needs to be detected once we have all the other tags are done. So the idea is that we look for these groupings left to the lowest level of grouping, the assumption is that this grouping will always lie somewhere left to the lower level groupings and this can go upto n'th level though I don't think this would go more then 2 levels in any case because its harder to understand more deeper levels.

This idea, seems general enough and there would be some quirks there, I am pretty sure. But what about grouping labels like "State Legislatures" which itself doesn't have any corresponding value but it encapsulates other groups. Its more like a column name from what I understand though.

More Assumptions :-

  • Text that has a header above or below, its most probably part of the header.
  • Text on left of lower level grouping would be its higher grouping.
  • Text with no other labels in its path on the right would be a category of the grouping mentioned under it.
  • Higher level groupings are always found on the left most areas.

Observations :-

  • White space around each type also varies

In [76]:
def combine_headers(row, labels):
    left, top = row['left'], row['top']
    right, bottom = row['right'], row['bottom']
    # labels in a radius of 15
    x_pos_axis = labels[(labels.left.between(right - 10, right + 15)) &
                        (labels.label == 'cell_header')]
    x_neg_axis = labels[(labels.right.between(left - 15, left + 10)) &
                        (labels.label == 'cell_header')]
    y_pos_axis = labels[(labels.bottom.between(top - 20, top + 10)) &
                        (labels.label == 'cell_header')]
    y_neg_axis = labels[(labels.top.between(bottom - 10, bottom + 15)) &
                        (labels.label == 'cell_header')]

    if (len(x_pos_axis) + len(x_neg_axis) + len(y_pos_axis) + len(y_neg_axis)) > 0:
        if pd.isnull(row['label']):
            row['label'] = 'cell_header'
    return row

def find_higher_level_groupings(row, cell_grouping_height_range):
    bottom, label = row['bottom'], row['label']
    min_range, max_range = cell_grouping_height_range
    if (bottom > min_range) and (bottom < max_range) and (pd.isnull(label)):
        row['label'] = 'cell_higher_grouping'
    return row

In [77]:
text_labels.apply(combine_headers, axis=1, args=[text_labels])


Out[77]:
area bottom centroid_x centroid_y comma_separated_numbers_present height is_text label left number pos possible_row_merger right text text_length top top_str width
49 6212 1261 1675.519961 1229.796523 False 59 True cell_header 1613 NaN 49 False 1740 Budget 6 1202 1202 127
50 6578 1253 1943.899970 1228.496047 False 51 True cell_header 1875 NaN 50 False 2015 Revised 7 1202 1202 140
51 6254 1261 2225.048129 1229.807963 False 59 True cell_header 2162 NaN 51 False 2290 Budget 6 1202 1202 128
52 15406 1365 1401.864858 1316.907763 False 107 True cell_header 1313 NaN 52 True 1492 Actuals,\n20152016 17 1258 1258 179
53 16239 1365 1675.118111 1314.721535 False 107 True cell_header 1587 NaN 53 True 1766 Estimate,\n20162017 18 1258 1258 179
54 16225 1365 1950.056086 1314.757966 False 107 True cell_header 1862 NaN 54 True 2041 Estimate,\n20162017 18 1258 1258 179
55 16260 1365 2225.510578 1314.892005 False 107 True cell_header 2137 NaN 55 True 2316 Estimate,\n20172018 18 1258 1258 179
57 2913 1422 1417.744936 1398.115002 False 50 True cell_header 1388 NaN 57 False 1453 Rs. 3 1372 1372 65
58 2913 1422 1692.744936 1398.115002 False 50 True cell_header 1663 NaN 58 False 1728 Rs. 3 1372 1372 65
59 2942 1422 1967.092114 1398.202583 False 50 True cell_header 1937 NaN 59 False 2003 Rs. 3 1372 1372 66
60 2942 1422 2242.092114 1398.202583 False 50 True cell_header 2212 NaN 60 False 2278 Rs. 3 1372 1372 66
65 22148 1534 442.953766 1507.708868 False 51 True NaN 223 NaN 65 False 664 STATE LEGISLATURES 18 1483 1483 441
67 31709 1647 349.448264 1585.197042 False 108 True NaN 144 NaN 67 True 577 101 Legislative Assembly\nNPNon Plan 35 1539 1539 433
68 5405 1647 1160.456614 1621.225532 False 51 True cell_grouping 1103 NaN 68 False 1217 Voted 5 1596 1596 114
69 19647 1707 1395.623912 1645.971802 True 111 False cell_values 1279 NaN 69 True 1494 17,20,13,766\n11,26,566 22 1596 1596 215
70 19835 1707 1669.511772 1646.143181 True 111 False cell_values 1552 NaN 70 True 1768 26,26,46,000\n39,97,000 22 1596 1596 216
71 19739 1707 1944.744668 1645.944830 True 111 False cell_values 1827 NaN 71 True 2043 27,39,50,000\n40,51,000 22 1596 1596 216
72 19758 1707 2219.727098 1645.939366 True 111 False cell_values 2102 NaN 72 True 2318 28,79,98,000\n42,79,000 22 1596 1596 216
73 7042 1710 1143.765124 1679.896904 False 58 True cell_grouping 1070 NaN 73 False 1219 Charged 7 1652 1652 149
77 8887 1815 1120.465736 1789.383594 False 51 True cell_grouping 1027 NaN 77 False 1216 Total 101 10 1764 1764 189
78 11128 1821 1384.561916 1790.102894 True 57 False cell_values 1279 173140332.0 78 False 1492 17,31,40,332 12 1764 1764 213
79 11284 1821 1658.860510 1790.116005 True 57 False cell_values 1552 266643000.0 79 False 1767 26,66,43,000 12 1764 1764 215
80 11273 1821 1933.878116 1790.075047 True 57 False cell_values 1827 278001000.0 80 False 2042 27,80,01,000 12 1764 1764 215
81 11264 1821 2208.825994 1790.029652 True 57 False cell_values 2102 292277000.0 81 False 2317 29,22,77,000 12 1764 1764 215
85 32860 1997 356.778119 1934.333932 False 108 True NaN 144 NaN 85 True 601 103 Legislative Secretariat\nNPNon Plan 38 1889 1889 457
86 5405 1997 1160.456614 1971.225532 False 51 True cell_grouping 1103 NaN 86 False 1217 Voted 5 1946 1946 114
87 18661 2057 1397.109426 1994.193559 True 111 False cell_values 1277 NaN 87 True 1492 23,84,31,139\n4,92,764 21 1946 1946 215
88 19758 2057 1669.740814 1995.957081 True 111 False cell_values 1552 NaN 88 True 1768 34,07,78,000\n15,30,000 22 1946 1946 216
89 19715 2057 1944.900178 1996.003145 True 111 False cell_values 1827 NaN 89 True 2043 28,41,78,000\n15,30,000 22 1946 1946 216
90 19737 2057 2219.742818 1995.971272 True 111 False cell_values 2102 NaN 90 True 2318 32,70,68,000\n16,31,000 22 1946 1946 216
91 7042 2060 1143.765124 2029.896904 False 58 True cell_grouping 1070 NaN 91 False 1219 Charged 7 2002 2002 149
95 9013 2165 1121.750361 2139.285698 False 51 True cell_grouping 1027 NaN 95 False 1217 Total 103 10 2114 2114 190
96 11288 2171 1383.735383 2140.092310 True 57 False cell_values 1277 238923903.0 96 False 1492 23,89,23,903 12 2114 2114 215
97 11312 2171 1658.689003 2140.116160 True 57 False cell_values 1552 342308000.0 97 False 1767 34,23,08,000 12 2114 2114 215
98 11289 2171 1933.839844 2140.082735 True 57 False cell_values 1827 285708000.0 98 False 2042 28,57,08,000 12 2114 2114 215
99 11315 2171 2208.660539 2140.126911 True 57 False cell_values 2102 328699000.0 99 False 2317 32,86,99,000 12 2114 2114 215
103 15930 2290 1042.681293 2265.474451 False 51 True cell_grouping 874 NaN 103 False 1218 Grand Total Gross 18 2239 2239 344
104 11140 2296 1384.866876 2265.031688 True 57 False cell_values 1277 412064235.0 104 False 1492 41,20,64,235 12 2239 2239 215
105 11232 2296 1659.334046 2265.156428 True 57 False cell_values 1553 608951000.0 105 False 1767 60,89,51,000 12 2239 2239 214
106 11284 2296 1933.903314 2265.162708 True 57 False cell_values 1827 563709000.0 106 False 2042 56,37,09,000 12 2239 2239 215
107 11232 2296 2209.386485 2265.143162 True 57 False cell_values 2103 620976000.0 107 False 2317 62,09,76,000 12 2239 2239 214
111 5085 2403 1163.480826 2377.670403 False 51 True cell_grouping 1108 NaN 111 False 1218 Voted 5 2352 2352 110
112 19416 2463 1395.550783 2402.243871 True 111 False cell_values 1277 NaN 112 True 1493 41,04,44,905\n16,19,330 22 2352 2352 216
113 19639 2463 1669.992158 2402.199145 True 111 False cell_values 1553 NaN 113 True 1768 60,34,24,000\n55,27,000 22 2352 2352 215
114 19683 2463 1944.925926 2402.218107 True 111 False cell_values 1828 NaN 114 True 2043 55,81,28,000\n55,81,000 22 2352 2352 215
115 19667 2463 2220.060558 2402.283216 True 111 False cell_values 2103 NaN 115 True 2318 61,50,66,000\n59,10,000 22 2352 2352 215
116 7042 2466 1143.765124 2435.896904 False 58 True cell_grouping 1070 NaN 116 False 1219 Charged 7 2408 2408 149
120 11604 2572 1090.725009 2546.832644 False 51 True cell_grouping 969 NaN 120 False 1218 NP Non Plan 12 2521 2521 249
121 11140 2578 1384.866876 2547.031688 True 57 False cell_values 1277 412064235.0 121 False 1492 41,20,64,235 12 2521 2521 215
122 11232 2578 1659.334046 2547.156428 True 57 False cell_values 1553 608951000.0 122 False 1767 60,89,51,000 12 2521 2521 214
123 11284 2578 1933.903314 2547.162708 True 57 False cell_values 1827 563709000.0 123 False 2042 56,37,09,000 12 2521 2521 215
124 11232 2578 2209.386485 2547.143162 True 57 False cell_values 2103 620976000.0 124 False 2317 62,09,76,000 12 2521 2521 214
128 5085 2684 1163.480826 2658.670403 False 51 True cell_grouping 1108 NaN 128 False 1218 Voted 5 2633 2633 110
129 19416 2744 1395.550783 2683.243871 True 111 False cell_values 1277 NaN 129 True 1493 41,04,44,905\n16,19,330 22 2633 2633 216
130 19639 2744 1669.992158 2683.199145 True 111 False cell_values 1553 NaN 130 True 1768 60,34,24,000\n55,27,000 22 2633 2633 215
131 19683 2744 1944.925926 2683.218107 True 111 False cell_values 1828 NaN 131 True 2043 55,81,28,000\n55,81,000 22 2633 2633 215
132 19667 2744 2220.060558 2683.283216 True 111 False cell_values 2103 NaN 132 True 2318 61,50,66,000\n59,10,000 22 2633 2633 215
133 7042 2747 1143.765124 2716.896904 False 58 True cell_grouping 1070 NaN 133 False 1219 Charged 7 2689 2689 149

In [78]:
grouping_height_range = text_labels[text_labels.label == 'cell_grouping'].aggregate({'top': [min, max]}).values.flatten()
text_labels = text_labels.apply(combine_headers, axis=1, args=[text_labels]).apply(find_higher_level_groupings, axis=1, args=[grouping_height_range])

In [79]:
page4_tables = draw_table_bounding_box(img_page4, label_stats)
for table_bounds in page4_tables:
    table = rlsa_eroded_image_page4[table_bounds['top']:table_bounds['bottom'],
                        table_bounds['left']:table_bounds['right']]
    labels_in_table = label_stats[(label_stats.top > (table_bounds['top']  + 5)) & 
                                  (label_stats.bottom < (table_bounds['bottom'] - 5)) &
                                  (label_stats.left > (table_bounds['left'] - 5)) &
                                  (label_stats.right < (table_bounds['right'] + 5))]
    labels_in_table = labels_in_table.apply(get_text_data, axis=1, args=[4, pdf_file_path,
                                                                         horizontal_ratio, vertical_ratio])
    text_labels = labels_in_table[labels_in_table.text_length > 0].apply(mark_number_cells, axis=1)
    text_labels = text_labels.apply(mark_text_cells, axis=1, args=[text_labels])
    grouping_height_range = text_labels[text_labels.label == 'cell_grouping'].aggregate({'top': [min, max]}).values.flatten()
    text_labels = text_labels.apply(combine_headers, axis=1, args=[text_labels]).apply(find_higher_level_groupings, axis=1, args=[grouping_height_range])


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:3]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:4]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]

In [80]:
text_labels[['text', 'label']].to_csv('page4_tags.csv')

there are some odd results, which makes the problem not easily solvable with just heuristics. Maybe we need to think about it from another angle, rather then trying to say what each block is why not focus on the important parts and let everything fall into place.. To expand on this, currently we are able to : -

- Get headers.
- Get Values

Based on these two lets draw the outline structure of the table with rows and columns, after that figure out in which row the other text columns lie, so instead of figuring out groupings, grouping levels, headers why not just dumbify the problem by trying to put each text column into some row.

Todo :-

  • Implement the new idea
  • create a class/func that works end to end to test out other pages.

In [81]:
def extract_table_attributes(page_num, pdf):
    img_page = get_page_image_from_pdf(pdf, page_num, 'west_bengal_demand_1_5_page_1.png')
    image_height, image_width = img_page.shape
    horizontal_ratio = page_width / image_width
    vertical_ratio = page_height / image_height
    img_rlsa = rlsa(img_page, (20, 25))
    n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_rlsa)
    label_stats = get_label_stats(stats, centroids)
    label_stats['right'] = label_stats.left + label_stats.width
    label_stats['bottom'] = label_stats.top + label_stats.height
    tables = draw_table_bounding_box(img_page, label_stats)
    if len(tables) == 0: 
        print("No Tables found on page number {0}".format(page_num))
    all_text_labels = pd.DataFrame()
    index = 1
    for table_bounds in tables:
        table = img_rlsa[table_bounds['top']:table_bounds['bottom'],
                         table_bounds['left']:table_bounds['right']]
        labels_in_table = label_stats[(label_stats.top > (table_bounds['top']  + 5)) & 
                                      (label_stats.bottom < (table_bounds['bottom'] - 5)) &
                                      (label_stats.left > (table_bounds['left'] - 5)) &
                                      (label_stats.right < (table_bounds['right'] + 5))]
        labels_in_table = labels_in_table.apply(get_text_data, axis=1, args=[page_num, pdf_file_path,
                                                                             horizontal_ratio, vertical_ratio])
        text_labels = labels_in_table[labels_in_table.text_length > 0].apply(mark_number_cells, axis=1)
        text_labels = text_labels.apply(mark_text_cells, axis=1, args=[text_labels])
        grouping_height_range = text_labels[text_labels.label == 'cell_grouping'].aggregate({'top': [min, max]}).values.flatten()
        text_labels = text_labels.apply(combine_headers, axis=1, args=[text_labels]).apply(find_higher_level_groupings, axis=1, args=[grouping_height_range])
        text_labels['page_num'] = page_num
        text_labels['table_num'] = index
        index += 1
        all_text_labels = pd.concat([all_text_labels, text_labels])
    return all_text_labels

In [82]:
extract_table_attributes(2, pdf)


SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:3]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:4]
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]
Out[82]:
area bottom centroid_x centroid_y comma_separated_numbers_present height is_text label left number possible_row_merger right text text_length top width page_num table_num
20 9843.0 679.0 1933.956416 648.331505 False 59.0 True cell_header 1833.0 NaN False 2039.0 Charged Rs. 11 620.0 206.0 2 1
21 8152.0 672.0 1678.513494 646.764475 False 51.0 True cell_header 1593.0 NaN False 1767.0 Voted Rs. 9 621.0 174.0 2 1
22 7692.0 672.0 2232.769501 646.766381 False 51.0 True cell_header 2154.0 NaN False 2316.0 Total Rs. 9 621.0 162.0 2 1
26 14864.0 791.0 775.951023 760.459634 False 58.0 True cell_grouping 618.0 NaN False 938.0 Gross Expenditure 17 733.0 320.0 2 1
27 19317.0 846.0 1671.421235 782.727390 True 113.0 False cell_values 1553.0 NaN True 1767.0 61,50,66,000\n2,61,000 21 733.0 214.0 2 1
28 8747.0 790.0 1957.048588 759.089859 True 57.0 False cell_values 1873.0 5910000.0 False 2042.0 59,10,000 9 733.0 169.0 2 1
29 19346.0 846.0 2221.338261 782.709604 True 113.0 False cell_values 2103.0 NaN True 2317.0 62,09,76,000\n2,61,000 21 733.0 214.0 2 1
30 6448.0 840.0 640.050093 816.051489 False 51.0 True cell_grouping 570.0 NaN False 719.0 Deduct 7 789.0 149.0 2 1
31 8624.0 840.0 813.858071 816.339402 False 51.0 True cell_grouping 721.0 NaN False 911.0 Recoveries 10 789.0 190.0 2 1
32 1311.0 840.0 2016.483600 824.467582 False 30.0 False cell_values 1995.0 NaN False 2039.0 ... 3 810.0 44.0 2 1
36 13501.0 960.0 857.588253 929.038368 False 58.0 True cell_grouping 720.0 NaN False 1004.0 Net Expenditure 15 902.0 284.0 2 1
37 11192.0 959.0 1659.620622 928.225965 True 57.0 False cell_values 1553.0 614805000.0 False 1767.0 61,48,05,000 12 902.0 214.0 2 1
38 8747.0 959.0 1957.048588 928.089859 True 57.0 False cell_values 1873.0 5910000.0 False 2042.0 59,10,000 9 902.0 169.0 2 1
39 11226.0 959.0 2209.406378 928.139319 True 57.0 False cell_values 2103.0 620715000.0 False 2317.0 62,07,15,000 12 902.0 214.0 2 1
49 6212.0 1261.0 1675.519961 1229.796523 False 59.0 True cell_header 1613.0 NaN False 1740.0 Budget 6 1202.0 127.0 2 2
50 6578.0 1253.0 1943.899970 1228.496047 False 51.0 True cell_header 1875.0 NaN False 2015.0 Revised 7 1202.0 140.0 2 2
51 6254.0 1261.0 2225.048129 1229.807963 False 59.0 True cell_header 2162.0 NaN False 2290.0 Budget 6 1202.0 128.0 2 2
52 15406.0 1365.0 1401.864858 1316.907763 False 107.0 True cell_header 1313.0 NaN True 1492.0 Actuals,\n20152016 17 1258.0 179.0 2 2
53 16239.0 1365.0 1675.118111 1314.721535 False 107.0 True cell_header 1587.0 NaN True 1766.0 Estimate,\n20162017 18 1258.0 179.0 2 2
54 16225.0 1365.0 1950.056086 1314.757966 False 107.0 True cell_header 1862.0 NaN True 2041.0 Estimate,\n20162017 18 1258.0 179.0 2 2
55 16260.0 1365.0 2225.510578 1314.892005 False 107.0 True cell_header 2137.0 NaN True 2316.0 Estimate,\n20172018 18 1258.0 179.0 2 2
57 2913.0 1422.0 1417.744936 1398.115002 False 50.0 True cell_header 1388.0 NaN False 1453.0 Rs. 3 1372.0 65.0 2 2
58 2913.0 1422.0 1692.744936 1398.115002 False 50.0 True cell_header 1663.0 NaN False 1728.0 Rs. 3 1372.0 65.0 2 2
59 2942.0 1422.0 1967.092114 1398.202583 False 50.0 True cell_header 1937.0 NaN False 2003.0 Rs. 3 1372.0 66.0 2 2
60 2942.0 1422.0 2242.092114 1398.202583 False 50.0 True cell_header 2212.0 NaN False 2278.0 Rs. 3 1372.0 66.0 2 2
65 22148.0 1534.0 442.953766 1507.708868 False 51.0 True NaN 223.0 NaN False 664.0 STATE LEGISLATURES 18 1483.0 441.0 2 2
67 31709.0 1647.0 349.448264 1585.197042 False 108.0 True cell_higher_grouping 144.0 NaN True 577.0 101 Legislative Assembly\nNPNon Plan 35 1539.0 433.0 2 2
68 5405.0 1647.0 1160.456614 1621.225532 False 51.0 True cell_grouping 1103.0 NaN False 1217.0 Voted 5 1596.0 114.0 2 2
69 19647.0 1707.0 1395.623912 1645.971802 True 111.0 False cell_values 1279.0 NaN True 1494.0 17,20,13,766\n11,26,566 22 1596.0 215.0 2 2
70 19835.0 1707.0 1669.511772 1646.143181 True 111.0 False cell_values 1552.0 NaN True 1768.0 26,26,46,000\n39,97,000 22 1596.0 216.0 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
89 19715.0 2057.0 1944.900178 1996.003145 True 111.0 False cell_values 1827.0 NaN True 2043.0 28,41,78,000\n15,30,000 22 1946.0 216.0 2 2
90 19737.0 2057.0 2219.742818 1995.971272 True 111.0 False cell_values 2102.0 NaN True 2318.0 32,70,68,000\n16,31,000 22 1946.0 216.0 2 2
91 7042.0 2060.0 1143.765124 2029.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2002.0 149.0 2 2
95 9013.0 2165.0 1121.750361 2139.285698 False 51.0 True cell_grouping 1027.0 NaN False 1217.0 Total 103 10 2114.0 190.0 2 2
96 11288.0 2171.0 1383.735383 2140.092310 True 57.0 False cell_values 1277.0 238923903.0 False 1492.0 23,89,23,903 12 2114.0 215.0 2 2
97 11312.0 2171.0 1658.689003 2140.116160 True 57.0 False cell_values 1552.0 342308000.0 False 1767.0 34,23,08,000 12 2114.0 215.0 2 2
98 11289.0 2171.0 1933.839844 2140.082735 True 57.0 False cell_values 1827.0 285708000.0 False 2042.0 28,57,08,000 12 2114.0 215.0 2 2
99 11315.0 2171.0 2208.660539 2140.126911 True 57.0 False cell_values 2102.0 328699000.0 False 2317.0 32,86,99,000 12 2114.0 215.0 2 2
103 15930.0 2290.0 1042.681293 2265.474451 False 51.0 True cell_grouping 874.0 NaN False 1218.0 Grand Total Gross 18 2239.0 344.0 2 2
104 11140.0 2296.0 1384.866876 2265.031688 True 57.0 False cell_values 1277.0 412064235.0 False 1492.0 41,20,64,235 12 2239.0 215.0 2 2
105 11232.0 2296.0 1659.334046 2265.156428 True 57.0 False cell_values 1553.0 608951000.0 False 1767.0 60,89,51,000 12 2239.0 214.0 2 2
106 11284.0 2296.0 1933.903314 2265.162708 True 57.0 False cell_values 1827.0 563709000.0 False 2042.0 56,37,09,000 12 2239.0 215.0 2 2
107 11232.0 2296.0 2209.386485 2265.143162 True 57.0 False cell_values 2103.0 620976000.0 False 2317.0 62,09,76,000 12 2239.0 214.0 2 2
111 5085.0 2403.0 1163.480826 2377.670403 False 51.0 True cell_grouping 1108.0 NaN False 1218.0 Voted 5 2352.0 110.0 2 2
112 19416.0 2463.0 1395.550783 2402.243871 True 111.0 False cell_values 1277.0 NaN True 1493.0 41,04,44,905\n16,19,330 22 2352.0 216.0 2 2
113 19639.0 2463.0 1669.992158 2402.199145 True 111.0 False cell_values 1553.0 NaN True 1768.0 60,34,24,000\n55,27,000 22 2352.0 215.0 2 2
114 19683.0 2463.0 1944.925926 2402.218107 True 111.0 False cell_values 1828.0 NaN True 2043.0 55,81,28,000\n55,81,000 22 2352.0 215.0 2 2
115 19667.0 2463.0 2220.060558 2402.283216 True 111.0 False cell_values 2103.0 NaN True 2318.0 61,50,66,000\n59,10,000 22 2352.0 215.0 2 2
116 7042.0 2466.0 1143.765124 2435.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2408.0 149.0 2 2
120 11604.0 2572.0 1090.725009 2546.832644 False 51.0 True cell_grouping 969.0 NaN False 1218.0 NP Non Plan 12 2521.0 249.0 2 2
121 11140.0 2578.0 1384.866876 2547.031688 True 57.0 False cell_values 1277.0 412064235.0 False 1492.0 41,20,64,235 12 2521.0 215.0 2 2
122 11232.0 2578.0 1659.334046 2547.156428 True 57.0 False cell_values 1553.0 608951000.0 False 1767.0 60,89,51,000 12 2521.0 214.0 2 2
123 11284.0 2578.0 1933.903314 2547.162708 True 57.0 False cell_values 1827.0 563709000.0 False 2042.0 56,37,09,000 12 2521.0 215.0 2 2
124 11232.0 2578.0 2209.386485 2547.143162 True 57.0 False cell_values 2103.0 620976000.0 False 2317.0 62,09,76,000 12 2521.0 214.0 2 2
128 5085.0 2684.0 1163.480826 2658.670403 False 51.0 True cell_grouping 1108.0 NaN False 1218.0 Voted 5 2633.0 110.0 2 2
129 19416.0 2744.0 1395.550783 2683.243871 True 111.0 False cell_values 1277.0 NaN True 1493.0 41,04,44,905\n16,19,330 22 2633.0 216.0 2 2
130 19639.0 2744.0 1669.992158 2683.199145 True 111.0 False cell_values 1553.0 NaN True 1768.0 60,34,24,000\n55,27,000 22 2633.0 215.0 2 2
131 19683.0 2744.0 1944.925926 2683.218107 True 111.0 False cell_values 1828.0 NaN True 2043.0 55,81,28,000\n55,81,000 22 2633.0 215.0 2 2
132 19667.0 2744.0 2220.060558 2683.283216 True 111.0 False cell_values 2103.0 NaN True 2318.0 61,50,66,000\n59,10,000 22 2633.0 215.0 2 2
133 7042.0 2747.0 1143.765124 2716.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2689.0 149.0 2 2

72 rows × 18 columns


In [83]:
def find_rows(labels):
    """
    Figure out the points where rows start and end.
    
    1. Headers would be the starting point. Bottom of the headers
    2. Each cell value will be separated by a row. We need to handle where text is merged.
    """
    rows = []
    rows.extend(labels[labels['label'] == 'cell_header'].aggregate({'top': min, 'bottom': max}).values.flatten())
    rows.extend(labels[labels['label'] == 'cell_values']['bottom'].unique())
    filtered_rows = []
    for index, row_sep in enumerate(rows):
        if index > 0:
            last_row_sep = rows[index - 1]
            if row_sep > (last_row_sep + 20):
                filtered_rows.append(row_sep)
        else:
            filtered_rows.append(row_sep)
    return filtered_rows

def plot_horizontal_lines(line_ys, img, x_start, x_end):
    waste_copy2 = img.copy()
    for line_y in line_ys:
        line_starting_points = (int(x_start), int(line_y))
        line_stopping_points = (int(x_end), int(line_y))
        cv2.line(waste_copy2, line_starting_points, line_stopping_points, (125,255,0), 3)
    plot_page(waste_copy2)

def find_common_headers(labels):
    unmarked_headers = labels[(labels.label == 'cell_header') & (pd.isnull(labels.header_index))]
    if len(unmarked_headers) > 0:
        header = unmarked_headers[unmarked_headers.top == unmarked_headers.top.min()]
        labels.ix[((labels.label == 'cell_header') & 
                   (labels.left.between(header.left.iloc[0] - header.width.iloc[0], header.right.iloc[0])) &
                   (pd.isnull(labels.header_index))
                  ), 'header_index'] = header.pos.iloc[0]
        return find_common_headers(labels)
    return labels
    
def cleanse_labels(labels):
    """
    We need to clean up labels to transform them into csv's
    
    1. Combine Headers so that they don't end up in new lines.
    2. Split label blocks with 2 values combined 
    """
    processed_labels = pd.DataFrame()
    
    for index, row in labels.iterrows():
        splitted_row = []
        if row.possible_row_merger == True and  row.label != 'cell_header':
            for index, value in enumerate(row.text.split('\n')):
                new_row = {}
                for col in row.index:
                    new_row[col] = row[col]
                new_height = row.height // len(row.text.split('\n'))
                new_row['height'] = new_height
                new_row['top'] = row.top + (index * new_height)
                new_row['bottom'] = new_row['top'] + new_height
                new_row['text'] = value
                new_row['possible_row_merger'] = False
                splitted_row.append(new_row)
            processed_labels = processed_labels.append(splitted_row)
        else:
            processed_labels = processed_labels.append(row)
    return find_common_headers(processed_labels)

def mark_titles(labels):
    labels.ix[((labels.is_text == True) & 
               (labels.centroid_x.between(1200, 1300)) &
               (pd.isnull(labels.label))
              ), 'label'] = 'title'
    return labels

def mark_summary(row):
    if row['is_text'] == True:
        summaries = re.findall("^[A-z, a-z]*\sRs. [0-9|,]*$", row['text'])
        if len(summaries) > 0:
            row['label'] = 'cell_summary'
    return row


def filter_out_footer(labels):
    bottom_boundary = labels[labels.label == 'cell_values']['top'].max()
    return labels[labels.bottom < bottom_boundary]

def extract_table_attributes_row_based(page_num, pdf):
    img_page = get_page_image_from_pdf(pdf, page_num, 'west_bengal_demand_1_5_page_1.png')
    image_height, image_width = img_page.shape
    horizontal_ratio = page_width / image_width
    vertical_ratio = page_height / image_height
    img_rlsa = rlsa(img_page, (20, 25))
    n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_rlsa)
    label_stats = get_label_stats(stats, centroids)
    label_stats['right'] = label_stats.left + label_stats.width
    label_stats['bottom'] = label_stats.top + label_stats.height
    label_stats = label_stats.apply(get_text_data, axis=1, args=[page_num, pdf_file_path,
                                                                 horizontal_ratio, vertical_ratio])
#     label_stats = mark_titles(label_stats)
    tables = draw_table_bounding_box(img_page, label_stats)
    if len(tables) == 0:
        print("No Tables found on page number {0}".format(page_num))
    all_text_labels = pd.DataFrame()
    index = 1
    for table_bounds in tables:
        table = img_rlsa[table_bounds['top']:table_bounds['bottom'],
                         table_bounds['left']:table_bounds['right']]
        labels_in_table = label_stats[(label_stats.top > (table_bounds['top']  + 5)) &
                                      (label_stats.bottom < (table_bounds['bottom'] - 5)) &
                                      (label_stats.left > (table_bounds['left'] - 5)) &
                                      (label_stats.right < (table_bounds['right'] + 5))]
#         labels_in_table = labels_in_table.apply(get_text_data, axis=1, args=[page_num, pdf_file_path,
#                                                                              horizontal_ratio, vertical_ratio])
        text_labels = labels_in_table[labels_in_table.text_length > 0].apply(mark_number_cells, axis=1)
        text_labels = text_labels.apply(mark_text_cells, axis=1, args=[text_labels])
        grouping_height_range = text_labels[text_labels.label == 'cell_grouping'].aggregate({'top': [min, max]}).values.flatten()
        text_labels = text_labels.apply(combine_headers, axis=1, args=[text_labels]).apply(find_higher_level_groupings, axis=1, args=[grouping_height_range])
        text_labels['header_index'] = None
        text_labels['pos'] = text_labels.index
        text_labels = cleanse_labels(text_labels)
        text_labels['page_num'] = page_num
        text_labels['table_num'] = index
        rows = find_rows(text_labels)
        row_ranges = zip(rows, rows[1:])
        for start, end in row_ranges:
            print(text_labels[text_labels.top.between(start, end - 5)].text.values)
        plot_horizontal_lines(rows, img_page, 150, 2350)
        index += 1
        all_text_labels = pd.concat([all_text_labels, text_labels])
        break
    return all_text_labels

In [84]:
extract_table_attributes_row_based(2, pdf)


DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated [ipykernel_launcher.py:72]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-84-a300c0c59ef4> in <module>()
----> 1 extract_table_attributes_row_based(2, pdf)

<ipython-input-83-38f66dd4e7d5> in extract_table_attributes_row_based(page_num, pdf)
     97     label_stats = label_stats.apply(get_text_data, axis=1, args=[page_num, pdf_file_path,
     98                                                                  horizontal_ratio, vertical_ratio])
---> 99     label_stats = mark_titles(label_stats)
    100     tables = draw_table_bounding_box(img_page, label_stats)
    101     if len(tables) == 0:

<ipython-input-83-38f66dd4e7d5> in mark_titles(labels)
     70                (labels.centroid_x.between(1200, 1300)) &
     71                (pd.isnull(labels.label))
---> 72               ), 'label'] = 'title'
     73     return labels
     74 

/home/jayant/projects/cbga/cbga_parsers/lib/python2.7/site-packages/pandas/core/generic.pyc in __getattr__(self, name)
   3079             if name in self._info_axis:
   3080                 return self[name]
-> 3081             return object.__getattribute__(self, name)
   3082 
   3083     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'label'

In [ ]:
glabel_stats

In [ ]:
extract_table_attributes_row_based(22, pdf)

In [ ]:
extract_table_attributes_row_based(30, pdf)

There is a lot of overlap in the rows and it seems to work well in a couple of pages. Another observation page no 22 has a table separation without any vertical lines, there are two observations on that front :-

- Threre is a huge text block between them
- There is a lot of space between the tables due to the heading


Todo

- filtering out overlapping or close by lines -------x
- do the following label processing : - 
    - separate out labels where `\n` new line character is present into two labels -------x
    - merge all header labels into one ---------x
- a better table detection method.
- a process to convert the information into a consumable csv.
- Detect Titles and store the tables with title names

In [ ]:
label_stats

Figuring out title detection

There are a couple of assumptions that we can use to detect the title text : -

- Its usually center aligned
- It would be bigger then the other text

In [85]:
img_page = get_page_image_from_pdf(pdf, 2, 'west_bengal_demand_1_5_page_1.png')
image_height, image_width = img_page.shape
horizontal_ratio = page_width / image_width
vertical_ratio = page_height / image_height
img_rlsa = rlsa(img_page, (20, 25))
n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_rlsa)
label_stats = get_label_stats(stats, centroids)
label_stats['right'] = label_stats.left + label_stats.width
label_stats['bottom'] = label_stats.top + label_stats.height

In [86]:
img_page.shape


Out[86]:
(3508, 2479)

In [87]:
label_stats_with_text = label_stats.apply(get_text_data, axis=1, args=[2, pdf_file_path, horizontal_ratio, vertical_ratio])

In [88]:
label_stats_with_text[label_stats_with_text.is_text == True]


Out[88]:
left top width height area centroid_x centroid_y right bottom text text_length possible_row_merger comma_separated_numbers_present is_text number
4 885.0 103.0 690.0 62.0 42076.0 1229.127793 133.112392 1575.0 165.0 REVENUE EXPENDITURE 19 False False True NaN
6 1049.0 177.0 360.0 57.0 19420.0 1225.853708 205.540783 1409.0 234.0 DEMAND No. 01 13 False False True NaN
8 901.0 246.0 658.0 66.0 34215.0 1225.751308 277.009908 1559.0 312.0 Legislative Assembly Secretariat 32 False False True NaN
10 827.0 315.0 805.0 66.0 41069.0 1232.540627 346.063040 1632.0 381.0 AGeneral Services (a) Organs of State 38 False False True NaN
12 798.0 383.0 864.0 66.0 44366.0 1228.446445 413.491277 1662.0 449.0 Head of Account : 2011 State Legislatures 42 False False True NaN
14 142.0 502.0 462.0 65.0 25409.0 379.711559 531.492739 604.0 567.0 Voted Rs. 61,50,66,000 22 False False True NaN
15 1016.0 502.0 446.0 66.0 24421.0 1242.827566 532.310348 1462.0 568.0 Charged Rs. 59,10,000 21 False False True NaN
16 1869.0 502.0 448.0 65.0 24894.0 2098.311762 531.411304 2317.0 567.0 Total Rs. 62,09,76,000 22 False False True NaN
20 1833.0 620.0 206.0 59.0 9843.0 1933.956416 648.331505 2039.0 679.0 Charged Rs. 11 False False True NaN
21 1593.0 621.0 174.0 51.0 8152.0 1678.513494 646.764475 1767.0 672.0 Voted Rs. 9 False False True NaN
22 2154.0 621.0 162.0 51.0 7692.0 2232.769501 646.766381 2316.0 672.0 Total Rs. 9 False False True NaN
26 618.0 733.0 320.0 58.0 14864.0 775.951023 760.459634 938.0 791.0 Gross Expenditure 17 False False True NaN
30 570.0 789.0 149.0 51.0 6448.0 640.050093 816.051489 719.0 840.0 Deduct 7 False False True NaN
31 721.0 789.0 190.0 51.0 8624.0 813.858071 816.339402 911.0 840.0 Recoveries 10 False False True NaN
36 720.0 902.0 284.0 58.0 13501.0 857.588253 929.038368 1004.0 960.0 Net Expenditure 15 False False True NaN
43 911.0 1025.0 638.0 59.0 37051.0 1229.191007 1053.613938 1549.0 1084.0 REVENUE EXPENDITURE 19 False False True NaN
45 1019.0 1089.0 421.0 51.0 20972.0 1230.401821 1113.837784 1440.0 1140.0 ABSTRACT ACCOUNT 16 False False True NaN
49 1613.0 1202.0 127.0 59.0 6212.0 1675.519961 1229.796523 1740.0 1261.0 Budget 6 False False True NaN
50 1875.0 1202.0 140.0 51.0 6578.0 1943.899970 1228.496047 2015.0 1253.0 Revised 7 False False True NaN
51 2162.0 1202.0 128.0 59.0 6254.0 2225.048129 1229.807963 2290.0 1261.0 Budget 6 False False True NaN
52 1313.0 1258.0 179.0 107.0 15406.0 1401.864858 1316.907763 1492.0 1365.0 Actuals,\n20152016 17 True False True NaN
53 1587.0 1258.0 179.0 107.0 16239.0 1675.118111 1314.721535 1766.0 1365.0 Estimate,\n20162017 18 True False True NaN
54 1862.0 1258.0 179.0 107.0 16225.0 1950.056086 1314.757966 2041.0 1365.0 Estimate,\n20162017 18 True False True NaN
55 2137.0 1258.0 179.0 107.0 16260.0 2225.510578 1314.892005 2316.0 1365.0 Estimate,\n20172018 18 True False True NaN
57 1388.0 1372.0 65.0 50.0 2913.0 1417.744936 1398.115002 1453.0 1422.0 Rs. 3 False False True NaN
58 1663.0 1372.0 65.0 50.0 2913.0 1692.744936 1398.115002 1728.0 1422.0 Rs. 3 False False True NaN
59 1937.0 1372.0 66.0 50.0 2942.0 1967.092114 1398.202583 2003.0 1422.0 Rs. 3 False False True NaN
60 2212.0 1372.0 66.0 50.0 2942.0 2242.092114 1398.202583 2278.0 1422.0 Rs. 3 False False True NaN
65 223.0 1483.0 441.0 51.0 22148.0 442.953766 1507.708868 664.0 1534.0 STATE LEGISLATURES 18 False False True NaN
67 144.0 1539.0 433.0 108.0 31709.0 349.448264 1585.197042 577.0 1647.0 101 Legislative Assembly\nNPNon Plan 35 True False True NaN
68 1103.0 1596.0 114.0 51.0 5405.0 1160.456614 1621.225532 1217.0 1647.0 Voted 5 False False True NaN
73 1070.0 1652.0 149.0 58.0 7042.0 1143.765124 1679.896904 1219.0 1710.0 Charged 7 False False True NaN
77 1027.0 1764.0 189.0 51.0 8887.0 1120.465736 1789.383594 1216.0 1815.0 Total 101 10 False False True NaN
85 144.0 1889.0 457.0 108.0 32860.0 356.778119 1934.333932 601.0 1997.0 103 Legislative Secretariat\nNPNon Plan 38 True False True NaN
86 1103.0 1946.0 114.0 51.0 5405.0 1160.456614 1971.225532 1217.0 1997.0 Voted 5 False False True NaN
91 1070.0 2002.0 149.0 58.0 7042.0 1143.765124 2029.896904 1219.0 2060.0 Charged 7 False False True NaN
95 1027.0 2114.0 190.0 51.0 9013.0 1121.750361 2139.285698 1217.0 2165.0 Total 103 10 False False True NaN
103 874.0 2239.0 344.0 51.0 15930.0 1042.681293 2265.474451 1218.0 2290.0 Grand Total Gross 18 False False True NaN
111 1108.0 2352.0 110.0 51.0 5085.0 1163.480826 2377.670403 1218.0 2403.0 Voted 5 False False True NaN
116 1070.0 2408.0 149.0 58.0 7042.0 1143.765124 2435.896904 1219.0 2466.0 Charged 7 False False True NaN
120 969.0 2521.0 249.0 51.0 11604.0 1090.725009 2546.832644 1218.0 2572.0 NP Non Plan 12 False False True NaN
128 1108.0 2633.0 110.0 51.0 5085.0 1163.480826 2658.670403 1218.0 2684.0 Voted 5 False False True NaN
133 1070.0 2689.0 149.0 58.0 7042.0 1143.765124 2716.896904 1219.0 2747.0 Charged 7 False False True NaN
137 135.0 2957.0 2157.0 86.0 81564.0 1227.328956 2981.084621 2292.0 3043.0 ______________________________________________... 140 True False True NaN

In [89]:
plot_page(img_page)



In [90]:
label_stats_with_text[(label_stats_with_text.is_text == True) & (label_stats_with_text.centroid_x.between(1200, 1300))]


Out[90]:
left top width height area centroid_x centroid_y right bottom text text_length possible_row_merger comma_separated_numbers_present is_text number
4 885.0 103.0 690.0 62.0 42076.0 1229.127793 133.112392 1575.0 165.0 REVENUE EXPENDITURE 19 False False True NaN
6 1049.0 177.0 360.0 57.0 19420.0 1225.853708 205.540783 1409.0 234.0 DEMAND No. 01 13 False False True NaN
8 901.0 246.0 658.0 66.0 34215.0 1225.751308 277.009908 1559.0 312.0 Legislative Assembly Secretariat 32 False False True NaN
10 827.0 315.0 805.0 66.0 41069.0 1232.540627 346.063040 1632.0 381.0 AGeneral Services (a) Organs of State 38 False False True NaN
12 798.0 383.0 864.0 66.0 44366.0 1228.446445 413.491277 1662.0 449.0 Head of Account : 2011 State Legislatures 42 False False True NaN
15 1016.0 502.0 446.0 66.0 24421.0 1242.827566 532.310348 1462.0 568.0 Charged Rs. 59,10,000 21 False False True NaN
43 911.0 1025.0 638.0 59.0 37051.0 1229.191007 1053.613938 1549.0 1084.0 REVENUE EXPENDITURE 19 False False True NaN
45 1019.0 1089.0 421.0 51.0 20972.0 1230.401821 1113.837784 1440.0 1140.0 ABSTRACT ACCOUNT 16 False False True NaN
137 135.0 2957.0 2157.0 86.0 81564.0 1227.328956 2981.084621 2292.0 3043.0 ______________________________________________... 140 True False True NaN

In [91]:
label_stats_with_text = label_stats_with_text[label_stats_with_text.text_length > 0].apply(mark_number_cells, axis=1)
label_stats_with_text = label_stats_with_text.apply(mark_text_cells, axis=1, args=[label_stats_with_text])
label_stats_with_text = label_stats_with_text.apply(combine_headers, axis=1, args=[label_stats_with_text])
label_stats_with_text = label_stats_with_text.apply(mark_summary, axis=1)
label_stats_with_text = filter_out_footer(label_stats_with_text)

In [92]:
label_stats_with_text


Out[92]:
area bottom centroid_x centroid_y comma_separated_numbers_present height is_text label left number possible_row_merger right text text_length top width
4 42076.0 165.0 1229.127793 133.112392 False 62.0 True cell_header 885.0 NaN False 1575.0 REVENUE EXPENDITURE 19 103.0 690.0
6 19420.0 234.0 1225.853708 205.540783 False 57.0 True NaN 1049.0 NaN False 1409.0 DEMAND No. 01 13 177.0 360.0
8 34215.0 312.0 1225.751308 277.009908 False 66.0 True NaN 901.0 NaN False 1559.0 Legislative Assembly Secretariat 32 246.0 658.0
10 41069.0 381.0 1232.540627 346.063040 False 66.0 True NaN 827.0 NaN False 1632.0 AGeneral Services (a) Organs of State 38 315.0 805.0
12 44366.0 449.0 1228.446445 413.491277 False 66.0 True NaN 798.0 NaN False 1662.0 Head of Account : 2011 State Legislatures 42 383.0 864.0
14 25409.0 567.0 379.711559 531.492739 False 65.0 True cell_summary 142.0 NaN False 604.0 Voted Rs. 61,50,66,000 22 502.0 462.0
15 24421.0 568.0 1242.827566 532.310348 False 66.0 True cell_summary 1016.0 NaN False 1462.0 Charged Rs. 59,10,000 21 502.0 446.0
16 24894.0 567.0 2098.311762 531.411304 False 65.0 True cell_summary 1869.0 NaN False 2317.0 Total Rs. 62,09,76,000 22 502.0 448.0
20 9843.0 679.0 1933.956416 648.331505 False 59.0 True cell_header 1833.0 NaN False 2039.0 Charged Rs. 11 620.0 206.0
21 8152.0 672.0 1678.513494 646.764475 False 51.0 True cell_header 1593.0 NaN False 1767.0 Voted Rs. 9 621.0 174.0
22 7692.0 672.0 2232.769501 646.766381 False 51.0 True cell_header 2154.0 NaN False 2316.0 Total Rs. 9 621.0 162.0
26 14864.0 791.0 775.951023 760.459634 False 58.0 True cell_grouping 618.0 NaN False 938.0 Gross Expenditure 17 733.0 320.0
27 19317.0 846.0 1671.421235 782.727390 True 113.0 False cell_values 1553.0 NaN True 1767.0 61,50,66,000\n2,61,000 21 733.0 214.0
28 8747.0 790.0 1957.048588 759.089859 True 57.0 False cell_values 1873.0 5910000.0 False 2042.0 59,10,000 9 733.0 169.0
29 19346.0 846.0 2221.338261 782.709604 True 113.0 False cell_values 2103.0 NaN True 2317.0 62,09,76,000\n2,61,000 21 733.0 214.0
30 6448.0 840.0 640.050093 816.051489 False 51.0 True cell_grouping 570.0 NaN False 719.0 Deduct 7 789.0 149.0
31 8624.0 840.0 813.858071 816.339402 False 51.0 True cell_grouping 721.0 NaN False 911.0 Recoveries 10 789.0 190.0
32 1311.0 840.0 2016.483600 824.467582 False 30.0 False cell_values 1995.0 NaN False 2039.0 ... 3 810.0 44.0
36 13501.0 960.0 857.588253 929.038368 False 58.0 True cell_grouping 720.0 NaN False 1004.0 Net Expenditure 15 902.0 284.0
37 11192.0 959.0 1659.620622 928.225965 True 57.0 False cell_values 1553.0 614805000.0 False 1767.0 61,48,05,000 12 902.0 214.0
38 8747.0 959.0 1957.048588 928.089859 True 57.0 False cell_values 1873.0 5910000.0 False 2042.0 59,10,000 9 902.0 169.0
39 11226.0 959.0 2209.406378 928.139319 True 57.0 False cell_values 2103.0 620715000.0 False 2317.0 62,07,15,000 12 902.0 214.0
43 37051.0 1084.0 1229.191007 1053.613938 False 59.0 True NaN 911.0 NaN False 1549.0 REVENUE EXPENDITURE 19 1025.0 638.0
45 20972.0 1140.0 1230.401821 1113.837784 False 51.0 True NaN 1019.0 NaN False 1440.0 ABSTRACT ACCOUNT 16 1089.0 421.0
49 6212.0 1261.0 1675.519961 1229.796523 False 59.0 True cell_header 1613.0 NaN False 1740.0 Budget 6 1202.0 127.0
50 6578.0 1253.0 1943.899970 1228.496047 False 51.0 True cell_header 1875.0 NaN False 2015.0 Revised 7 1202.0 140.0
51 6254.0 1261.0 2225.048129 1229.807963 False 59.0 True cell_header 2162.0 NaN False 2290.0 Budget 6 1202.0 128.0
52 15406.0 1365.0 1401.864858 1316.907763 False 107.0 True cell_header 1313.0 NaN True 1492.0 Actuals,\n20152016 17 1258.0 179.0
53 16239.0 1365.0 1675.118111 1314.721535 False 107.0 True cell_header 1587.0 NaN True 1766.0 Estimate,\n20162017 18 1258.0 179.0
54 16225.0 1365.0 1950.056086 1314.757966 False 107.0 True cell_header 1862.0 NaN True 2041.0 Estimate,\n20162017 18 1258.0 179.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
89 19715.0 2057.0 1944.900178 1996.003145 True 111.0 False cell_values 1827.0 NaN True 2043.0 28,41,78,000\n15,30,000 22 1946.0 216.0
90 19737.0 2057.0 2219.742818 1995.971272 True 111.0 False cell_values 2102.0 NaN True 2318.0 32,70,68,000\n16,31,000 22 1946.0 216.0
91 7042.0 2060.0 1143.765124 2029.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2002.0 149.0
95 9013.0 2165.0 1121.750361 2139.285698 False 51.0 True cell_grouping 1027.0 NaN False 1217.0 Total 103 10 2114.0 190.0
96 11288.0 2171.0 1383.735383 2140.092310 True 57.0 False cell_values 1277.0 238923903.0 False 1492.0 23,89,23,903 12 2114.0 215.0
97 11312.0 2171.0 1658.689003 2140.116160 True 57.0 False cell_values 1552.0 342308000.0 False 1767.0 34,23,08,000 12 2114.0 215.0
98 11289.0 2171.0 1933.839844 2140.082735 True 57.0 False cell_values 1827.0 285708000.0 False 2042.0 28,57,08,000 12 2114.0 215.0
99 11315.0 2171.0 2208.660539 2140.126911 True 57.0 False cell_values 2102.0 328699000.0 False 2317.0 32,86,99,000 12 2114.0 215.0
103 15930.0 2290.0 1042.681293 2265.474451 False 51.0 True cell_grouping 874.0 NaN False 1218.0 Grand Total Gross 18 2239.0 344.0
104 11140.0 2296.0 1384.866876 2265.031688 True 57.0 False cell_values 1277.0 412064235.0 False 1492.0 41,20,64,235 12 2239.0 215.0
105 11232.0 2296.0 1659.334046 2265.156428 True 57.0 False cell_values 1553.0 608951000.0 False 1767.0 60,89,51,000 12 2239.0 214.0
106 11284.0 2296.0 1933.903314 2265.162708 True 57.0 False cell_values 1827.0 563709000.0 False 2042.0 56,37,09,000 12 2239.0 215.0
107 11232.0 2296.0 2209.386485 2265.143162 True 57.0 False cell_values 2103.0 620976000.0 False 2317.0 62,09,76,000 12 2239.0 214.0
111 5085.0 2403.0 1163.480826 2377.670403 False 51.0 True cell_grouping 1108.0 NaN False 1218.0 Voted 5 2352.0 110.0
112 19416.0 2463.0 1395.550783 2402.243871 True 111.0 False cell_values 1277.0 NaN True 1493.0 41,04,44,905\n16,19,330 22 2352.0 216.0
113 19639.0 2463.0 1669.992158 2402.199145 True 111.0 False cell_values 1553.0 NaN True 1768.0 60,34,24,000\n55,27,000 22 2352.0 215.0
114 19683.0 2463.0 1944.925926 2402.218107 True 111.0 False cell_values 1828.0 NaN True 2043.0 55,81,28,000\n55,81,000 22 2352.0 215.0
115 19667.0 2463.0 2220.060558 2402.283216 True 111.0 False cell_values 2103.0 NaN True 2318.0 61,50,66,000\n59,10,000 22 2352.0 215.0
116 7042.0 2466.0 1143.765124 2435.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2408.0 149.0
120 11604.0 2572.0 1090.725009 2546.832644 False 51.0 True cell_grouping 969.0 NaN False 1218.0 NP Non Plan 12 2521.0 249.0
121 11140.0 2578.0 1384.866876 2547.031688 True 57.0 False cell_values 1277.0 412064235.0 False 1492.0 41,20,64,235 12 2521.0 215.0
122 11232.0 2578.0 1659.334046 2547.156428 True 57.0 False cell_values 1553.0 608951000.0 False 1767.0 60,89,51,000 12 2521.0 214.0
123 11284.0 2578.0 1933.903314 2547.162708 True 57.0 False cell_values 1827.0 563709000.0 False 2042.0 56,37,09,000 12 2521.0 215.0
124 11232.0 2578.0 2209.386485 2547.143162 True 57.0 False cell_values 2103.0 620976000.0 False 2317.0 62,09,76,000 12 2521.0 214.0
128 5085.0 2684.0 1163.480826 2658.670403 False 51.0 True cell_grouping 1108.0 NaN False 1218.0 Voted 5 2633.0 110.0
129 19416.0 2744.0 1395.550783 2683.243871 True 111.0 False cell_values 1277.0 NaN True 1493.0 41,04,44,905\n16,19,330 22 2633.0 216.0
130 19639.0 2744.0 1669.992158 2683.199145 True 111.0 False cell_values 1553.0 NaN True 1768.0 60,34,24,000\n55,27,000 22 2633.0 215.0
131 19683.0 2744.0 1944.925926 2683.218107 True 111.0 False cell_values 1828.0 NaN True 2043.0 55,81,28,000\n55,81,000 22 2633.0 215.0
132 19667.0 2744.0 2220.060558 2683.283216 True 111.0 False cell_values 2103.0 NaN True 2318.0 61,50,66,000\n59,10,000 22 2633.0 215.0
133 7042.0 2747.0 1143.765124 2716.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2689.0 149.0

83 rows × 16 columns


In [93]:
label_stats_with_text[['text', 'label']].values


Out[93]:
array([['REVENUE EXPENDITURE', 'cell_header'],
       ['DEMAND No. 01', nan],
       ['Legislative Assembly Secretariat', nan],
       ['AGeneral Services  (a) Organs of State', nan],
       ['Head of Account : 2011  State Legislatures', nan],
       ['Voted Rs. 61,50,66,000', 'cell_summary'],
       ['Charged Rs. 59,10,000', 'cell_summary'],
       ['Total Rs. 62,09,76,000', 'cell_summary'],
       ['Charged Rs.', 'cell_header'],
       ['Voted Rs.', 'cell_header'],
       ['Total Rs.', 'cell_header'],
       ['Gross Expenditure', 'cell_grouping'],
       ['61,50,66,000\n2,61,000', 'cell_values'],
       ['59,10,000', 'cell_values'],
       ['62,09,76,000\n2,61,000', 'cell_values'],
       ['Deduct ', 'cell_grouping'],
       ['Recoveries', 'cell_grouping'],
       ['...', 'cell_values'],
       ['Net Expenditure', 'cell_grouping'],
       ['61,48,05,000', 'cell_values'],
       ['59,10,000', 'cell_values'],
       ['62,07,15,000', 'cell_values'],
       ['REVENUE EXPENDITURE', nan],
       ['ABSTRACT ACCOUNT', nan],
       ['Budget', 'cell_header'],
       ['Revised', 'cell_header'],
       ['Budget', 'cell_header'],
       ['Actuals,\n20152016', 'cell_header'],
       ['Estimate,\n20162017', 'cell_header'],
       ['Estimate,\n20162017', 'cell_header'],
       ['Estimate,\n20172018', 'cell_header'],
       ['Rs.', 'cell_header'],
       ['Rs.', 'cell_header'],
       ['Rs.', 'cell_header'],
       ['Rs.', 'cell_header'],
       ['02 ', 'cell_values'],
       ['STATE LEGISLATURES', nan],
       ['101 Legislative Assembly\nNPNon Plan', nan],
       ['Voted', 'cell_grouping'],
       ['17,20,13,766\n11,26,566', 'cell_values'],
       ['26,26,46,000\n39,97,000', 'cell_values'],
       ['27,39,50,000\n40,51,000', 'cell_values'],
       ['28,79,98,000\n42,79,000', 'cell_values'],
       ['Charged', 'cell_grouping'],
       ['Total  101', 'cell_grouping'],
       ['17,31,40,332', 'cell_values'],
       ['26,66,43,000', 'cell_values'],
       ['27,80,01,000', 'cell_values'],
       ['29,22,77,000', 'cell_values'],
       ['103 Legislative Secretariat\nNPNon Plan', nan],
       ['Voted', 'cell_grouping'],
       ['23,84,31,139\n4,92,764', 'cell_values'],
       ['34,07,78,000\n15,30,000', 'cell_values'],
       ['28,41,78,000\n15,30,000', 'cell_values'],
       ['32,70,68,000\n16,31,000', 'cell_values'],
       ['Charged', 'cell_grouping'],
       ['Total  103', 'cell_grouping'],
       ['23,89,23,903', 'cell_values'],
       ['34,23,08,000', 'cell_values'],
       ['28,57,08,000', 'cell_values'],
       ['32,86,99,000', 'cell_values'],
       ['Grand Total  Gross', 'cell_grouping'],
       ['41,20,64,235', 'cell_values'],
       ['60,89,51,000', 'cell_values'],
       ['56,37,09,000', 'cell_values'],
       ['62,09,76,000', 'cell_values'],
       ['Voted', 'cell_grouping'],
       ['41,04,44,905\n16,19,330', 'cell_values'],
       ['60,34,24,000\n55,27,000', 'cell_values'],
       ['55,81,28,000\n55,81,000', 'cell_values'],
       ['61,50,66,000\n59,10,000', 'cell_values'],
       ['Charged', 'cell_grouping'],
       ['NP  Non Plan', 'cell_grouping'],
       ['41,20,64,235', 'cell_values'],
       ['60,89,51,000', 'cell_values'],
       ['56,37,09,000', 'cell_values'],
       ['62,09,76,000', 'cell_values'],
       ['Voted', 'cell_grouping'],
       ['41,04,44,905\n16,19,330', 'cell_values'],
       ['60,34,24,000\n55,27,000', 'cell_values'],
       ['55,81,28,000\n55,81,000', 'cell_values'],
       ['61,50,66,000\n59,10,000', 'cell_values'],
       ['Charged', 'cell_grouping']], dtype=object)

In [94]:
label_stats_with_text[label_stats_with_text.label == 'cell_values']['top'].max()


Out[94]:
2633.0

In [95]:
label_stats_with_text[label_stats_with_text.bottom < 3041].shape


Out[95]:
(83, 16)

Using titles to separate out tables

We need to figure out if there exists two groups of titles


In [96]:
titles = label_stats_with_text[label_stats_with_text.label == 'title']
titles


Out[96]:
area bottom centroid_x centroid_y comma_separated_numbers_present height is_text label left number possible_row_merger right text text_length top width

In [97]:
titles['next_diff'] = titles.top - titles.top.shift(1)
titles[titles.next_diff > titles.next_diff.mean()]


Out[97]:
area bottom centroid_x centroid_y comma_separated_numbers_present height is_text label left number possible_row_merger right text text_length top width next_diff

In [98]:
label_stats_with_text[label_stats_with_text.top < 1025]


Out[98]:
area bottom centroid_x centroid_y comma_separated_numbers_present height is_text label left number possible_row_merger right text text_length top width
4 42076.0 165.0 1229.127793 133.112392 False 62.0 True cell_header 885.0 NaN False 1575.0 REVENUE EXPENDITURE 19 103.0 690.0
6 19420.0 234.0 1225.853708 205.540783 False 57.0 True NaN 1049.0 NaN False 1409.0 DEMAND No. 01 13 177.0 360.0
8 34215.0 312.0 1225.751308 277.009908 False 66.0 True NaN 901.0 NaN False 1559.0 Legislative Assembly Secretariat 32 246.0 658.0
10 41069.0 381.0 1232.540627 346.063040 False 66.0 True NaN 827.0 NaN False 1632.0 AGeneral Services (a) Organs of State 38 315.0 805.0
12 44366.0 449.0 1228.446445 413.491277 False 66.0 True NaN 798.0 NaN False 1662.0 Head of Account : 2011 State Legislatures 42 383.0 864.0
14 25409.0 567.0 379.711559 531.492739 False 65.0 True cell_summary 142.0 NaN False 604.0 Voted Rs. 61,50,66,000 22 502.0 462.0
15 24421.0 568.0 1242.827566 532.310348 False 66.0 True cell_summary 1016.0 NaN False 1462.0 Charged Rs. 59,10,000 21 502.0 446.0
16 24894.0 567.0 2098.311762 531.411304 False 65.0 True cell_summary 1869.0 NaN False 2317.0 Total Rs. 62,09,76,000 22 502.0 448.0
20 9843.0 679.0 1933.956416 648.331505 False 59.0 True cell_header 1833.0 NaN False 2039.0 Charged Rs. 11 620.0 206.0
21 8152.0 672.0 1678.513494 646.764475 False 51.0 True cell_header 1593.0 NaN False 1767.0 Voted Rs. 9 621.0 174.0
22 7692.0 672.0 2232.769501 646.766381 False 51.0 True cell_header 2154.0 NaN False 2316.0 Total Rs. 9 621.0 162.0
26 14864.0 791.0 775.951023 760.459634 False 58.0 True cell_grouping 618.0 NaN False 938.0 Gross Expenditure 17 733.0 320.0
27 19317.0 846.0 1671.421235 782.727390 True 113.0 False cell_values 1553.0 NaN True 1767.0 61,50,66,000\n2,61,000 21 733.0 214.0
28 8747.0 790.0 1957.048588 759.089859 True 57.0 False cell_values 1873.0 5910000.0 False 2042.0 59,10,000 9 733.0 169.0
29 19346.0 846.0 2221.338261 782.709604 True 113.0 False cell_values 2103.0 NaN True 2317.0 62,09,76,000\n2,61,000 21 733.0 214.0
30 6448.0 840.0 640.050093 816.051489 False 51.0 True cell_grouping 570.0 NaN False 719.0 Deduct 7 789.0 149.0
31 8624.0 840.0 813.858071 816.339402 False 51.0 True cell_grouping 721.0 NaN False 911.0 Recoveries 10 789.0 190.0
32 1311.0 840.0 2016.483600 824.467582 False 30.0 False cell_values 1995.0 NaN False 2039.0 ... 3 810.0 44.0
36 13501.0 960.0 857.588253 929.038368 False 58.0 True cell_grouping 720.0 NaN False 1004.0 Net Expenditure 15 902.0 284.0
37 11192.0 959.0 1659.620622 928.225965 True 57.0 False cell_values 1553.0 614805000.0 False 1767.0 61,48,05,000 12 902.0 214.0
38 8747.0 959.0 1957.048588 928.089859 True 57.0 False cell_values 1873.0 5910000.0 False 2042.0 59,10,000 9 902.0 169.0
39 11226.0 959.0 2209.406378 928.139319 True 57.0 False cell_values 2103.0 620715000.0 False 2317.0 62,07,15,000 12 902.0 214.0

In [99]:
label_stats_with_text[label_stats_with_text.top >= 1025]


Out[99]:
area bottom centroid_x centroid_y comma_separated_numbers_present height is_text label left number possible_row_merger right text text_length top width
43 37051.0 1084.0 1229.191007 1053.613938 False 59.0 True NaN 911.0 NaN False 1549.0 REVENUE EXPENDITURE 19 1025.0 638.0
45 20972.0 1140.0 1230.401821 1113.837784 False 51.0 True NaN 1019.0 NaN False 1440.0 ABSTRACT ACCOUNT 16 1089.0 421.0
49 6212.0 1261.0 1675.519961 1229.796523 False 59.0 True cell_header 1613.0 NaN False 1740.0 Budget 6 1202.0 127.0
50 6578.0 1253.0 1943.899970 1228.496047 False 51.0 True cell_header 1875.0 NaN False 2015.0 Revised 7 1202.0 140.0
51 6254.0 1261.0 2225.048129 1229.807963 False 59.0 True cell_header 2162.0 NaN False 2290.0 Budget 6 1202.0 128.0
52 15406.0 1365.0 1401.864858 1316.907763 False 107.0 True cell_header 1313.0 NaN True 1492.0 Actuals,\n20152016 17 1258.0 179.0
53 16239.0 1365.0 1675.118111 1314.721535 False 107.0 True cell_header 1587.0 NaN True 1766.0 Estimate,\n20162017 18 1258.0 179.0
54 16225.0 1365.0 1950.056086 1314.757966 False 107.0 True cell_header 1862.0 NaN True 2041.0 Estimate,\n20162017 18 1258.0 179.0
55 16260.0 1365.0 2225.510578 1314.892005 False 107.0 True cell_header 2137.0 NaN True 2316.0 Estimate,\n20172018 18 1258.0 179.0
57 2913.0 1422.0 1417.744936 1398.115002 False 50.0 True cell_header 1388.0 NaN False 1453.0 Rs. 3 1372.0 65.0
58 2913.0 1422.0 1692.744936 1398.115002 False 50.0 True cell_header 1663.0 NaN False 1728.0 Rs. 3 1372.0 65.0
59 2942.0 1422.0 1967.092114 1398.202583 False 50.0 True cell_header 1937.0 NaN False 2003.0 Rs. 3 1372.0 66.0
60 2942.0 1422.0 2242.092114 1398.202583 False 50.0 True cell_header 2212.0 NaN False 2278.0 Rs. 3 1372.0 66.0
64 3324.0 1534.0 176.897714 1508.656438 False 51.0 False cell_values 143.0 2.0 False 219.0 02 3 1483.0 76.0
65 22148.0 1534.0 442.953766 1507.708868 False 51.0 True NaN 223.0 NaN False 664.0 STATE LEGISLATURES 18 1483.0 441.0
67 31709.0 1647.0 349.448264 1585.197042 False 108.0 True NaN 144.0 NaN True 577.0 101 Legislative Assembly\nNPNon Plan 35 1539.0 433.0
68 5405.0 1647.0 1160.456614 1621.225532 False 51.0 True cell_grouping 1103.0 NaN False 1217.0 Voted 5 1596.0 114.0
69 19647.0 1707.0 1395.623912 1645.971802 True 111.0 False cell_values 1279.0 NaN True 1494.0 17,20,13,766\n11,26,566 22 1596.0 215.0
70 19835.0 1707.0 1669.511772 1646.143181 True 111.0 False cell_values 1552.0 NaN True 1768.0 26,26,46,000\n39,97,000 22 1596.0 216.0
71 19739.0 1707.0 1944.744668 1645.944830 True 111.0 False cell_values 1827.0 NaN True 2043.0 27,39,50,000\n40,51,000 22 1596.0 216.0
72 19758.0 1707.0 2219.727098 1645.939366 True 111.0 False cell_values 2102.0 NaN True 2318.0 28,79,98,000\n42,79,000 22 1596.0 216.0
73 7042.0 1710.0 1143.765124 1679.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 1652.0 149.0
77 8887.0 1815.0 1120.465736 1789.383594 False 51.0 True cell_grouping 1027.0 NaN False 1216.0 Total 101 10 1764.0 189.0
78 11128.0 1821.0 1384.561916 1790.102894 True 57.0 False cell_values 1279.0 173140332.0 False 1492.0 17,31,40,332 12 1764.0 213.0
79 11284.0 1821.0 1658.860510 1790.116005 True 57.0 False cell_values 1552.0 266643000.0 False 1767.0 26,66,43,000 12 1764.0 215.0
80 11273.0 1821.0 1933.878116 1790.075047 True 57.0 False cell_values 1827.0 278001000.0 False 2042.0 27,80,01,000 12 1764.0 215.0
81 11264.0 1821.0 2208.825994 1790.029652 True 57.0 False cell_values 2102.0 292277000.0 False 2317.0 29,22,77,000 12 1764.0 215.0
85 32860.0 1997.0 356.778119 1934.333932 False 108.0 True NaN 144.0 NaN True 601.0 103 Legislative Secretariat\nNPNon Plan 38 1889.0 457.0
86 5405.0 1997.0 1160.456614 1971.225532 False 51.0 True cell_grouping 1103.0 NaN False 1217.0 Voted 5 1946.0 114.0
87 18661.0 2057.0 1397.109426 1994.193559 True 111.0 False cell_values 1277.0 NaN True 1492.0 23,84,31,139\n4,92,764 21 1946.0 215.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
89 19715.0 2057.0 1944.900178 1996.003145 True 111.0 False cell_values 1827.0 NaN True 2043.0 28,41,78,000\n15,30,000 22 1946.0 216.0
90 19737.0 2057.0 2219.742818 1995.971272 True 111.0 False cell_values 2102.0 NaN True 2318.0 32,70,68,000\n16,31,000 22 1946.0 216.0
91 7042.0 2060.0 1143.765124 2029.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2002.0 149.0
95 9013.0 2165.0 1121.750361 2139.285698 False 51.0 True cell_grouping 1027.0 NaN False 1217.0 Total 103 10 2114.0 190.0
96 11288.0 2171.0 1383.735383 2140.092310 True 57.0 False cell_values 1277.0 238923903.0 False 1492.0 23,89,23,903 12 2114.0 215.0
97 11312.0 2171.0 1658.689003 2140.116160 True 57.0 False cell_values 1552.0 342308000.0 False 1767.0 34,23,08,000 12 2114.0 215.0
98 11289.0 2171.0 1933.839844 2140.082735 True 57.0 False cell_values 1827.0 285708000.0 False 2042.0 28,57,08,000 12 2114.0 215.0
99 11315.0 2171.0 2208.660539 2140.126911 True 57.0 False cell_values 2102.0 328699000.0 False 2317.0 32,86,99,000 12 2114.0 215.0
103 15930.0 2290.0 1042.681293 2265.474451 False 51.0 True cell_grouping 874.0 NaN False 1218.0 Grand Total Gross 18 2239.0 344.0
104 11140.0 2296.0 1384.866876 2265.031688 True 57.0 False cell_values 1277.0 412064235.0 False 1492.0 41,20,64,235 12 2239.0 215.0
105 11232.0 2296.0 1659.334046 2265.156428 True 57.0 False cell_values 1553.0 608951000.0 False 1767.0 60,89,51,000 12 2239.0 214.0
106 11284.0 2296.0 1933.903314 2265.162708 True 57.0 False cell_values 1827.0 563709000.0 False 2042.0 56,37,09,000 12 2239.0 215.0
107 11232.0 2296.0 2209.386485 2265.143162 True 57.0 False cell_values 2103.0 620976000.0 False 2317.0 62,09,76,000 12 2239.0 214.0
111 5085.0 2403.0 1163.480826 2377.670403 False 51.0 True cell_grouping 1108.0 NaN False 1218.0 Voted 5 2352.0 110.0
112 19416.0 2463.0 1395.550783 2402.243871 True 111.0 False cell_values 1277.0 NaN True 1493.0 41,04,44,905\n16,19,330 22 2352.0 216.0
113 19639.0 2463.0 1669.992158 2402.199145 True 111.0 False cell_values 1553.0 NaN True 1768.0 60,34,24,000\n55,27,000 22 2352.0 215.0
114 19683.0 2463.0 1944.925926 2402.218107 True 111.0 False cell_values 1828.0 NaN True 2043.0 55,81,28,000\n55,81,000 22 2352.0 215.0
115 19667.0 2463.0 2220.060558 2402.283216 True 111.0 False cell_values 2103.0 NaN True 2318.0 61,50,66,000\n59,10,000 22 2352.0 215.0
116 7042.0 2466.0 1143.765124 2435.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2408.0 149.0
120 11604.0 2572.0 1090.725009 2546.832644 False 51.0 True cell_grouping 969.0 NaN False 1218.0 NP Non Plan 12 2521.0 249.0
121 11140.0 2578.0 1384.866876 2547.031688 True 57.0 False cell_values 1277.0 412064235.0 False 1492.0 41,20,64,235 12 2521.0 215.0
122 11232.0 2578.0 1659.334046 2547.156428 True 57.0 False cell_values 1553.0 608951000.0 False 1767.0 60,89,51,000 12 2521.0 214.0
123 11284.0 2578.0 1933.903314 2547.162708 True 57.0 False cell_values 1827.0 563709000.0 False 2042.0 56,37,09,000 12 2521.0 215.0
124 11232.0 2578.0 2209.386485 2547.143162 True 57.0 False cell_values 2103.0 620976000.0 False 2317.0 62,09,76,000 12 2521.0 214.0
128 5085.0 2684.0 1163.480826 2658.670403 False 51.0 True cell_grouping 1108.0 NaN False 1218.0 Voted 5 2633.0 110.0
129 19416.0 2744.0 1395.550783 2683.243871 True 111.0 False cell_values 1277.0 NaN True 1493.0 41,04,44,905\n16,19,330 22 2633.0 216.0
130 19639.0 2744.0 1669.992158 2683.199145 True 111.0 False cell_values 1553.0 NaN True 1768.0 60,34,24,000\n55,27,000 22 2633.0 215.0
131 19683.0 2744.0 1944.925926 2683.218107 True 111.0 False cell_values 1828.0 NaN True 2043.0 55,81,28,000\n55,81,000 22 2633.0 215.0
132 19667.0 2744.0 2220.060558 2683.283216 True 111.0 False cell_values 2103.0 NaN True 2318.0 61,50,66,000\n59,10,000 22 2633.0 215.0
133 7042.0 2747.0 1143.765124 2716.896904 False 58.0 True cell_grouping 1070.0 NaN False 1219.0 Charged 7 2689.0 149.0

61 rows × 16 columns


In [ ]: